Apollo Easy Scrape 3 avatar
Apollo Easy Scrape 3

Deprecated

Pricing

Pay per usage

Go to Store
Apollo Easy Scrape 3

Apollo Easy Scrape 3

Deprecated

Developed by

Mike Powers

Mike Powers

Maintained by Community

0.0 (0)

Pricing

Pay per usage

0

Total users

181

Monthly users

44

Last modified

a year ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# NOTE: this image bundles Node.js 18, Chrome, and an Xvfb virtual display;
# src/main.js relies on Xvfb because it launches Puppeteer with headless: false.
FROM apify/actor-node-puppeteer-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. The trailing `rm -r ~/.npm` drops the npm cache so
# it does not bloat this image layer.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor-15",
"title": "Project Puppeteer Crawler JavaScript",
"description": "Crawlee and Puppeteer project in JavaScript.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-puppeteer-chrome"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Scrape data from Apollo.io",
"type": "object",
"schemaVersion": 1,
"properties": {
"apollo_username": {
"title": "Apollo Email 📪",
"type": "string",
"description": "Your Apollo Email",
"editor": "textfield"
},
"apollo_password": {
"title": "Apollo Password 🔒",
"type": "string",
"description": "Your Apollo Password",
"editor": "textfield",
"isSecret": true
},
"list_url": {
"title": "List URL from Apollo 📜",
"type": "string",
"description": "The URL to navigate to after login.",
"editor": "textfield"
}
},
"required": ["apollo_username", "apollo_password", "list_url"]
}

src/main.js

1import { Actor } from 'apify';
2import puppeteer from 'puppeteer';
3
Actor.main(async () => {
    console.log('Actor starting...');

    // Read and validate the actor input (see .actor/input_schema.json).
    const input = await Actor.getInput();
    if (!input || !input.apollo_username || !input.apollo_password || !input.list_url) {
        console.log('Missing required input fields.');
        return;
    }

    const baseUrl = input.list_url; // Apollo list URL to scrape after login
    const email = input.apollo_username; // Apollo account email
    const password = input.apollo_password; // Apollo account password

    console.time('ScriptRunTime');

    // Launch a headful browser; the Docker image starts Xvfb so a visible
    // window can render inside the container.
    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
        headless: false,
        defaultViewport: null,
    });

    const page = await browser.newPage();

    // Log in to Apollo with the supplied credentials.
    await page.goto('https://app.apollo.io/#/login');
    await page.waitForSelector('input[name="email"]', { visible: true });
    await page.waitForSelector('input[name="password"]', { visible: true });
    await page.type('input[name="email"]', email);
    await page.type('input[name="password"]', password);
    await page.click('button[type="submit"]');
    await new Promise((resolve) => setTimeout(resolve, 2000));

    // Open the list page and read the "Total ..." link text to get the item count.
    await page.goto(baseUrl);
    await new Promise((resolve) => setTimeout(resolve, 2000));
    const totalText = await page.evaluate(() => {
        const targetElement = Array.from(document.querySelectorAll('a')).find((e) => e.textContent.trim().startsWith('Total'));
        return targetElement ? targetElement.textContent.trim() : null;
    });

    let totalItems = 0;
    if (totalText) {
        const totalItemsMatch = totalText.match(/\d+/);
        if (totalItemsMatch) {
            totalItems = parseInt(totalItemsMatch[0], 10);
            console.log(`Total items: ${totalItems}`);
        }
    }

    if (totalItems > 0) {
        const itemsPerPage = 25; // Apollo shows 25 rows per list page.
        const totalPages = Math.ceil(totalItems / itemsPerPage);
        console.log(`Total pages: ${totalPages}`);
        let allData = [];

        // Scrape every page of the list. Each <tbody> holds one person row.
        for (let i = 1; i <= totalPages; i++) {
            const pageUrl = `${baseUrl}&page=${i}`;
            console.log(`Scraping page: ${pageUrl}`);
            await page.goto(pageUrl);
            await page.waitForSelector('tbody', { visible: true });

            const data = await page.$$eval('tbody', (tbodies) => tbodies.map((tbody) => {
                const tr = tbody.querySelector('tr');
                const tdName = tr ? tr.querySelector('td') : null;
                // FIX: default to '' instead of null — the original crashed on
                // `null.replace(...)` whenever the name cell was missing.
                let name = tdName ? tdName.innerText.trim() : '';
                name = name.replace("------", "").trim();

                const parts = name.split(' ');
                let firstName = parts.shift() || '';
                let lastName = parts.join(' ');

                // CSV-style quoting: wrap in double quotes, doubling embedded quotes.
                const quote = (str) => `"${str.replace(/"/g, '""')}"`;

                firstName = quote(firstName);
                lastName = quote(lastName);
                // FIX: `fullName` was assigned without a declaration, creating an
                // implicit global inside the page context.
                const fullName = quote(name);

                const tdJobTitle = tr ? tr.querySelector('td:nth-child(2)') : null;
                let jobTitle = tdJobTitle ? tdJobTitle.innerText.trim() : '';
                jobTitle = quote(jobTitle);

                const tdCompanyName = tr ? tr.querySelector('td:nth-child(3)') : null;
                let companyName = tdCompanyName ? tdCompanyName.innerText.trim() : '';
                companyName = quote(companyName);

                const tdLocation = tr ? tr.querySelector('td:nth-child(5) .zp_Y6y8d') : null;
                let location = tdLocation ? tdLocation.innerText.trim() : '';
                location = quote(location);

                const tdEmployeeCount = tr ? tr.querySelector('td:nth-child(6)') : null;
                let employeeCount = tdEmployeeCount ? tdEmployeeCount.innerText.trim() : '';
                employeeCount = quote(employeeCount);

                const tdPhone = tr ? tr.querySelector('td:nth-child(7)') : null;
                let phone = tdPhone ? tdPhone.innerText.trim() : '';
                // Normalize to (XXX) XXX-XXXX; inputs without 10 consecutive
                // digits are left as a bare digit string.
                phone = phone.replace(/\D/g, '');
                phone = phone.replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');
                phone = quote(phone);

                const tdIndustry = tr ? tr.querySelector('td:nth-child(8)') : null;
                let industry = tdIndustry ? tdIndustry.innerText.trim() : '';
                industry = quote(industry);

                const tdKeywords = tr ? tr.querySelector('td:nth-child(9)') : null;
                let keywords = tdKeywords ? tdKeywords.innerText.trim() : '';
                keywords = quote(keywords);

                let facebookUrl = '';
                let twitterUrl = '';
                let companyLinkedinUrl = '';
                let companyUrl = '';

                if (tdCompanyName) {
                    const links = tdCompanyName.querySelectorAll('a[href]');
                    links.forEach((link) => {
                        const href = link.href.trim();
                        // FIX: single if/else-if chain — the original's dangling
                        // `if (facebook)` let one link also be classified by the
                        // branches below it.
                        if (href.includes('facebook.com')) facebookUrl = quote(href);
                        else if (href.includes('twitter.com')) twitterUrl = quote(href);
                        else if (href.includes('linkedin.com/company')) companyLinkedinUrl = quote(href);
                        else if (link.querySelector('.apollo-icon-link')) companyUrl = quote(href);
                    });
                }

                // First link in the row is the person's Apollo profile URL.
                const firstHref = tbody.querySelector('a[href]') ? tbody.querySelector('a[href]').href : '';
                const linkedinUrl = tdName && tdName.querySelector('a[href*="linkedin.com/in"]') ? tdName.querySelector('a[href*="linkedin.com/in"]').href : '';

                return {
                    firstName,
                    lastName,
                    fullName,
                    jobTitle,
                    companyName,
                    location,
                    employeeCount,
                    phone,
                    industry,
                    firstHref: quote(firstHref),
                    linkedinUrl: quote(linkedinUrl),
                    facebookUrl,
                    twitterUrl,
                    companyLinkedinUrl,
                    companyUrl,
                    keywords,
                };
            }));
            allData = allData.concat(data);
        }

        // Visit a person's Apollo profile page and collect any email addresses
        // found in the general-information card into `person.emails`.
        // On any failure the person gets a single empty email so downstream
        // padding/output logic never sees a missing array.
        async function processPerson(person, newPage) {
            // FIX: logged `person.name`, a property that does not exist on the
            // row object — use the fullName field instead.
            console.log(`Processing person: ${person.fullName}`);
            const cleanedUrl = person.firstHref.replace(/"/g, '');
            console.log(`Navigating to cleaned URL: ${cleanedUrl}`);

            try {
                await newPage.goto(cleanedUrl, { waitUntil: 'networkidle0' });
                console.log(`Page navigated to ${cleanedUrl}`);

                await newPage.waitForSelector('#general_information_card', { timeout: 10000 });
                console.log(`Found #general_information_card`);

                const emailElements = await newPage.$$eval('#general_information_card', (elements) => elements.map((element) => element.innerText));
                const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
                const emails = emailElements.flatMap((element) => element.match(emailRegex) || []);

                person.emails = emails.length > 0 ? emails : [''];
            } catch (error) {
                console.error(`Error processing ${person.fullName} at ${cleanedUrl}: ${error}`);
                person.emails = [''];
            }
        }

        // Enrich people with emails, a few browser tabs at a time.
        const batchSize = 5;
        for (let i = 0; i < allData.length; i += batchSize) {
            const batch = allData.slice(i, i + batchSize);
            console.log(`Processing batch from index ${i} to ${i + batchSize - 1}`);

            await Promise.all(batch.map(async (person) => {
                const newPage = await browser.newPage();
                try {
                    return await processPerson(person, newPage);
                } catch (error) {
                    console.error(`Error processing ${person.fullName}: ${error}`);
                } finally {
                    await newPage.close();
                }
            }));
            console.log(`Completed batch from index ${i} to ${i + batchSize - 1}`);
        }

        // Push one dataset row per person.
        // FIX: the original serialized everything to a CSV string and re-parsed
        // it line by line. Any field containing a newline (common in the
        // keywords column's innerText) corrupted the row split, embedded
        // doubled quotes were never unescaped, and the computed csvHeader was
        // dead code. Build the rows directly from the scraped objects instead.
        const maxEmails = allData.reduce((max, p) => Math.max(max, p.emails.length), 0);
        // Undo the CSV quoting applied during scraping: strip the surrounding
        // quotes and collapse doubled quotes back to single ones.
        const unquote = (field) => (field || '').replace(/^"|"$/g, '').replace(/""/g, '"');

        for (const person of allData) {
            const row = {
                firstName: unquote(person.firstName).trim(),
                lastName: unquote(person.lastName).trim(),
                fullName: unquote(person.fullName).trim(),
                jobTitle: unquote(person.jobTitle).trim(),
                companyName: unquote(person.companyName).trim(),
                location: unquote(person.location).trim(),
                employeeCount: unquote(person.employeeCount).trim(),
                phone: unquote(person.phone).trim(),
                industry: unquote(person.industry).trim(),
                url: unquote(person.firstHref).trim(),
                linkedinUrl: unquote(person.linkedinUrl).trim(),
                facebookUrl: unquote(person.facebookUrl).trim(),
                twitterUrl: unquote(person.twitterUrl).trim(),
                companyLinkedinUrl: unquote(person.companyLinkedinUrl).trim(),
                companyUrl: unquote(person.companyUrl).trim(),
                keywords: unquote(person.keywords).trim(),
            };

            // FIX: emails beyond the second were scraped but silently dropped
            // (only email1/email2 were mapped). Emit email1..emailN, padded to
            // the widest row so all rows share the same columns.
            const paddedEmails = [...person.emails, ...Array(maxEmails - person.emails.length).fill('')];
            paddedEmails.forEach((e, idx) => {
                row[`email${idx + 1}`] = e.trim();
            });

            await Actor.pushData(row);
        }
    } else {
        console.log('Element not found');
    }

    await browser.close();
    console.timeEnd('ScriptRunTime');
});

src/routes2.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

// Default handler: collect links from the landing page and queue them for
// the 'detail' handler below.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        label: 'detail',
        globs: ['https://apify.com/*'],
    });
});

// Detail handler: record the title and final (loaded) URL of each page.
router.addHandler('detail', async ({ request, page, log }) => {
    const { loadedUrl } = request;
    const title = await page.title();
    log.info(`${title}`, { url: loadedUrl });

    await Dataset.pushData({ url: loadedUrl, title });
});

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
"name": "crawlee-puppeteer-javascript",
"version": "0.0.1",
"type": "module",
"description": "This is an example of an Apify actor.",
"dependencies": {
"apify": "^3.1.10",
"crawlee": "^3.5.4",
"puppeteer": "*"
},
"devDependencies": {
"@apify/eslint-config": "^0.4.0",
"eslint": "^8.50.0"
},
"scripts": {
"start": "node src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}