Apollo Easy Scrape 3
This Actor is unavailable because the developer has decided to deprecate it.
mikepowers/apollo-easy-scrape-3
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor-15",
    "title": "Project Puppeteer Crawler JavaScript",
    "description": "Crawlee and Puppeteer project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Scrape data from Apollo.io",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "apollo_username": {
            "title": "Apollo Email 📪",
            "type": "string",
            "description": "Your Apollo Email",
            "editor": "textfield"
        },
        "apollo_password": {
            "title": "Apollo Password 🔒",
            "type": "string",
            "description": "Your Apollo Password",
            "editor": "textfield",
            "isSecret": true
        },
        "list_url": {
            "title": "List URL from Apollo 📜",
            "type": "string",
            "description": "The URL to navigate to after login.",
            "editor": "textfield"
        }
    },
    "required": ["apollo_username", "apollo_password", "list_url"]
}
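For reference, an input satisfying this schema could look like the following sketch of the object that Actor.getInput() resolves to in src/main.js. All values are placeholders: the credentials are not real and the list URL is only a hypothetical example of a saved Apollo people-list URL with its query string.

// Hypothetical input for the schema above (placeholder values only).
const exampleInput = {
    apollo_username: 'user@example.com',                       // Apollo Email
    apollo_password: 'example-password',                       // Apollo Password (isSecret in the schema)
    list_url: 'https://app.apollo.io/#/people?exampleListFilter=1', // List URL opened after login
};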
src/main.js
import { Actor } from 'apify';
import puppeteer from 'puppeteer';

Actor.main(async () => {
    console.log('Actor starting...');

    // Fetch input using Actor.getInput()
    const input = await Actor.getInput();

    // Validate input
    if (!input || !input.apollo_username || !input.apollo_password || !input.list_url) {
        console.log('Missing required input fields.');
        return;
    }

    // Use input values
    const baseUrl = input.list_url; // Use the list URL from the input
    const email = input.apollo_username; // Use the Apollo email from the input
    const password = input.apollo_password; // Use the Apollo password from the input

    // Start the Puppeteer browser
    console.time('ScriptRunTime');
    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
        headless: false, // Headful run; the Docker image starts Xvfb, so no display is needed. Set to true to run headless.
        defaultViewport: null,
    });

    // Log in to Apollo and open the list URL
    const page = await browser.newPage();
    await page.goto('https://app.apollo.io/#/login');
    await page.waitForSelector('input[name="email"]', { visible: true });
    await page.waitForSelector('input[name="password"]', { visible: true });
    await page.type('input[name="email"]', email);
    await page.type('input[name="password"]', password);
    await page.click('button[type="submit"]');
    await new Promise((resolve) => setTimeout(resolve, 2000));
    await page.goto(baseUrl);
    await new Promise((resolve) => setTimeout(resolve, 2000));

    // Read the "Total ..." link to determine how many items the list contains
    const totalText = await page.evaluate(() => {
        const targetElement = Array.from(document.querySelectorAll('a')).find((e) => e.textContent.trim().startsWith('Total'));
        return targetElement ? targetElement.textContent.trim() : null;
    });

    let totalItems = 0;
    if (totalText) {
        const totalItemsMatch = totalText.match(/\d+/);
        if (totalItemsMatch) {
            totalItems = parseInt(totalItemsMatch[0], 10);
            console.log(`Total items: ${totalItems}`);
        }
    }

    if (totalItems > 0) {
        const itemsPerPage = 25;
        const totalPages = Math.ceil(totalItems / itemsPerPage);
        console.log(`Total pages: ${totalPages}`);
        let allData = [];

        // Scrape the table rows on every page of the list
        for (let i = 1; i <= totalPages; i++) {
            const pageUrl = `${baseUrl}&page=${i}`;
            console.log(`Scraping page: ${pageUrl}`);
            await page.goto(pageUrl);
            await page.waitForSelector('tbody', { visible: true });

            const data = await page.$$eval('tbody', (tbodies) => tbodies.map((tbody) => {
                const tr = tbody.querySelector('tr');
                const tdName = tr ? tr.querySelector('td') : null;
                let name = tdName ? tdName.innerText.trim() : '';
                name = name.replace('------', '').trim();

                const parts = name.split(' ');
                let firstName = parts.shift();
                let lastName = parts.join(' ');

                // Wrap a value in double quotes and escape embedded quotes for CSV output
                const quote = (str) => `"${str.replace(/"/g, '""')}"`;

                firstName = quote(firstName);
                lastName = quote(lastName);
                const fullName = quote(name);

                const tdJobTitle = tr ? tr.querySelector('td:nth-child(2)') : null;
                let jobTitle = tdJobTitle ? tdJobTitle.innerText.trim() : '';
                jobTitle = quote(jobTitle);

                const tdCompanyName = tr ? tr.querySelector('td:nth-child(3)') : null;
                let companyName = tdCompanyName ? tdCompanyName.innerText.trim() : '';
                companyName = quote(companyName);

                const tdLocation = tr ? tr.querySelector('td:nth-child(5) .zp_Y6y8d') : null;
                let location = tdLocation ? tdLocation.innerText.trim() : '';
                location = quote(location);

                const tdEmployeeCount = tr ? tr.querySelector('td:nth-child(6)') : null;
                let employeeCount = tdEmployeeCount ? tdEmployeeCount.innerText.trim() : '';
                employeeCount = quote(employeeCount);

                const tdPhone = tr ? tr.querySelector('td:nth-child(7)') : null;
                let phone = tdPhone ? tdPhone.innerText.trim() : '';
                phone = phone.replace(/\D/g, '');
                phone = phone.replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');
                phone = quote(phone);

                const tdIndustry = tr ? tr.querySelector('td:nth-child(8)') : null;
                let industry = tdIndustry ? tdIndustry.innerText.trim() : '';
                industry = quote(industry);

                const tdKeywords = tr ? tr.querySelector('td:nth-child(9)') : null;
                let keywords = tdKeywords ? tdKeywords.innerText.trim() : '';
                keywords = quote(keywords);

                let facebookUrl = '', twitterUrl = '', companyLinkedinUrl = '', companyUrl = '';

                if (tdCompanyName) {
                    const links = tdCompanyName.querySelectorAll('a[href]');
                    links.forEach((link) => {
                        const href = link.href.trim();
                        if (href.includes('facebook.com')) facebookUrl = quote(href);
                        if (href.includes('twitter.com')) twitterUrl = quote(href);
                        else if (href.includes('linkedin.com/company')) companyLinkedinUrl = quote(href);
                        else if (link.querySelector('.apollo-icon-link')) companyUrl = quote(href);
                    });
                }

                const firstHref = tbody.querySelector('a[href]') ? tbody.querySelector('a[href]').href : '';
                const linkedinUrl = tdName && tdName.querySelector('a[href*="linkedin.com/in"]') ? tdName.querySelector('a[href*="linkedin.com/in"]').href : '';

                return {
                    firstName: firstName,
                    lastName: lastName,
                    fullName: fullName,
                    jobTitle: jobTitle,
                    companyName: companyName,
                    location: location,
                    employeeCount: employeeCount,
                    phone: phone,
                    industry: industry,
                    firstHref: quote(firstHref),
                    linkedinUrl: quote(linkedinUrl),
                    facebookUrl: facebookUrl,
                    twitterUrl: twitterUrl,
                    companyLinkedinUrl: companyLinkedinUrl,
                    companyUrl: companyUrl,
                    keywords: keywords,
                };
            }));
            allData = allData.concat(data);
        }

        // Open a person's detail page in its own tab and extract email addresses
        async function processPerson(person, newPage) {
            console.log(`Processing person: ${person.fullName}`);
            const cleanedUrl = person.firstHref.replace(/"/g, '');
            console.log(`Navigating to cleaned URL: ${cleanedUrl}`);

            try {
                await newPage.goto(cleanedUrl, { waitUntil: 'networkidle0' });
                console.log(`Page navigated to ${cleanedUrl}`);

                await newPage.waitForSelector('#general_information_card', { timeout: 10000 });
                console.log('Found #general_information_card');

                const emailElements = await newPage.$$eval('#general_information_card', (elements) => elements.map((element) => element.innerText));
                const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g;
                const emails = emailElements.flatMap((element) => element.match(emailRegex) || []);

                person.emails = emails.length > 0 ? emails : [''];
            } catch (error) {
                console.error(`Error processing ${person.fullName} at ${cleanedUrl}: ${error}`);
                person.emails = [''];
            }
        }

        // Process people in small parallel batches, each person in a fresh tab
        const batchSize = 5;
        for (let i = 0; i < allData.length; i += batchSize) {
            const batch = allData.slice(i, i + batchSize);
            console.log(`Processing batch from index ${i} to ${i + batchSize - 1}`);

            await Promise.all(batch.map(async (person) => {
                const newPage = await browser.newPage();
                try {
                    return await processPerson(person, newPage);
                } catch (error) {
                    console.error(`Error processing ${person.fullName}: ${error}`);
                } finally {
                    await newPage.close();
                }
            }));
            console.log(`Completed batch from index ${i} to ${i + batchSize - 1}`);
        }

        // Build CSV-style rows from the scraped data
        const maxEmails = allData.reduce((max, p) => Math.max(max, p.emails.length), 0);
        const emailHeaders = Array.from({ length: maxEmails }, (_, i) => `Email ${i + 1}`).join(',');
        const csvHeader = `First Name,Last Name,Full Name,Job Title,Company Name,Location,Employee Count,Phone,Industry,URL,LinkedIn URL,Facebook URL,Twitter URL,Company LinkedIn URL,Company URL,Keywords,${emailHeaders}\n`;

        const csvRows = allData.map((person) => {
            const paddedEmails = [...person.emails, ...Array(maxEmails - person.emails.length).fill('')];
            return `${person.firstName},${person.lastName},${person.fullName},${person.jobTitle},${person.companyName},${person.location},${person.employeeCount},${person.phone},${person.industry},${person.firstHref},${person.linkedinUrl},${person.facebookUrl},${person.twitterUrl},${person.companyLinkedinUrl},${person.companyUrl},${person.keywords},${paddedEmails.join(',')}`;
        }).join('\n');

        // csvRows contains only data rows (the header is kept separately in csvHeader),
        // so every non-empty line is parsed and pushed to the dataset.
        const lines = csvRows.split('\n');

        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            if (!line.trim()) continue; // Skip empty lines

            // Use a CSV parsing approach that accounts for commas within quotes
            const fields = parseCSVLine(line);

            // Map the parsed fields to the output structure
            const row = {
                firstName: fields[0].trim() || '',
                lastName: fields[1].trim() || '',
                fullName: fields[2].trim() || '',
                jobTitle: fields[3].trim() || '',
                companyName: fields[4].trim() || '',
                location: fields[5].trim() || '',
                employeeCount: fields[6].trim() || '',
                phone: fields[7].trim() || '',
                industry: fields[8].trim() || '',
                url: fields[9].trim() || '',
                linkedinUrl: fields[10].trim() || '',
                facebookUrl: fields[11].trim() || '',
                twitterUrl: fields[12].trim() || '',
                companyLinkedinUrl: fields[13].trim() || '',
                companyUrl: fields[14].trim() || '',
                keywords: fields[15].trim() || '',
                email1: fields[16] ? fields[16].trim() : '',
                email2: fields[17] ? fields[17].trim() : '',
            };

            // Push the parsed row to the Actor's default dataset
            await Actor.pushData(row);
        }

        // Parse a single CSV line, treating commas inside double quotes as part of the field
        function parseCSVLine(line) {
            const result = [];
            let start = 0;
            let inQuotes = false;
            for (let i = 0; i < line.length; i++) {
                if (line[i] === '"' && (i === 0 || line[i - 1] !== '\\')) {
                    inQuotes = !inQuotes;
                } else if (line[i] === ',' && !inQuotes) {
                    result.push(line.substring(start, i));
                    start = i + 1;
                }
            }
            result.push(line.substring(start)); // Push the last field

            // Remove quotes from the beginning and end of each field
            return result.map((field) => field.replace(/^"|"$/g, ''));
        }
    } else {
        console.log('Could not determine the total number of items; nothing to scrape.');
    }

    await browser.close();
    console.timeEnd('ScriptRunTime');
});
src/routes2.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify",
    "root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{
    "name": "crawlee-puppeteer-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "puppeteer": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
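Once a run finishes, the rows pushed with Actor.pushData() in src/main.js land in the run's default dataset. The following is a minimal sketch (not part of this Actor's source) of starting the Actor and reading that dataset with the apify-client package; it assumes apify-client is installed, an API token is available in the APIFY_TOKEN environment variable, and the input values are the same placeholders used earlier.

// Sketch: call the Actor via the Apify API and read its default dataset.
// Assumes `npm install apify-client` and a token in process.env.APIFY_TOKEN.
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the Actor and wait for the run to finish (placeholder input values).
const run = await client.actor('mikepowers/apollo-easy-scrape-3').call({
    apollo_username: 'user@example.com',
    apollo_password: 'example-password',
    list_url: 'https://app.apollo.io/#/people?exampleListFilter=1',
});

// List the rows the run pushed to its default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Scraped ${items.length} rows`);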
Maintained by Community