
Apollo Easy Scrape 3

Deprecated

This Actor is unavailable because its developer has deprecated it.

mikepowers/apollo-easy-scrape-3

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, the build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro performance gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-15",
    "title": "Project Puppeteer Crawler JavaScript",
    "description": "Crawlee and Puppeteer project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from Apollo.io",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "apollo_username": {
            "title": "Apollo Email 📪",
            "type": "string",
            "description": "Your Apollo Email",
            "editor": "textfield"
        },
        "apollo_password": {
            "title": "Apollo Password 🔒",
            "type": "string",
            "description": "Your Apollo Password",
            "editor": "textfield",
            "isSecret": true
        },
        "list_url": {
            "title": "List URL from Apollo 📜",
            "type": "string",
            "description": "The URL to navigate to after login.",
            "editor": "textfield"
        }
    },
    "required": ["apollo_username", "apollo_password", "list_url"]
}
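
For reference, Actor.getInput() in src/main.js resolves to an object matching this schema. A hypothetical input (all values made up) could look like:

// Hypothetical input record matching the schema above.
const exampleInput = {
    apollo_username: 'jane@example.com',               // Apollo Email
    apollo_password: 's3cret-password',                // Apollo Password (marked isSecret)
    list_url: 'https://app.apollo.io/#/people?page=1', // List URL to scrape
};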

src/main.js

import { Actor } from 'apify';
import puppeteer from 'puppeteer';

Actor.main(async () => {
    console.log('Actor starting...');

    // Fetch the input using Actor.getInput()
    const input = await Actor.getInput();

    // Validate input
    if (!input || !input.apollo_username || !input.apollo_password || !input.list_url) {
        console.log('Missing required input fields.');
        return;
    }

    // Use input values
    const baseUrl = input.list_url; // Use the list URL from the input
    const email = input.apollo_username; // Use the Apollo email from the input
    const password = input.apollo_password; // Use the Apollo password from the input

    // Start the Puppeteer browser
    console.time("ScriptRunTime");
    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
        headless: false, // Set to true to run headless
        defaultViewport: null
    });

    const page = await browser.newPage();
    await page.goto('https://app.apollo.io/#/login');
    await page.waitForSelector('input[name="email"]', { visible: true });
    await page.waitForSelector('input[name="password"]', { visible: true });
    await page.type('input[name="email"]', email);
    await page.type('input[name="password"]', password);
    await page.click('button[type="submit"]');
    await new Promise(resolve => setTimeout(resolve, 2000));
    await page.goto(baseUrl);
    await new Promise(resolve => setTimeout(resolve, 2000));
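    // A less brittle alternative to the fixed 2-second sleeps above would be
    // to wait for the post-login navigation explicitly, e.g.:
    //
    //     await Promise.all([
    //         page.waitForNavigation({ waitUntil: 'networkidle2' }),
    //         page.click('button[type="submit"]'),
    //     ]);
    //
    // (A sketch only; Apollo's login may redirect client-side without a full
    // navigation, in which case waiting for a known selector is safer.)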
    const totalText = await page.evaluate(() => {
        const targetElement = Array.from(document.querySelectorAll('a')).find(e => e.textContent.trim().startsWith('Total'));
        return targetElement ? targetElement.textContent.trim() : null;
    });

    let totalItems = 0;
    if (totalText) {
        const totalItemsMatch = totalText.match(/\d+/);
        if (totalItemsMatch) {
            totalItems = parseInt(totalItemsMatch[0], 10);
            console.log(`Total items: ${totalItems}`);
        }
    }
    if (totalItems > 0) {
        const itemsPerPage = 25;
        const totalPages = Math.ceil(totalItems / itemsPerPage);
        console.log(`Total pages: ${totalPages}`);
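        // e.g. 113 total items → Math.ceil(113 / 25) = 5 pages to visit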
        let allData = [];
        for (let i = 1; i <= totalPages; i++) {
            const pageUrl = `${baseUrl}&page=${i}`;
            console.log(`Scraping page: ${pageUrl}`);
            await page.goto(pageUrl);
            await page.waitForSelector('tbody', { visible: true });

            // Map every <tbody> on the page to one scraped record (one
            // contact per <tbody>, with the person's cells in its first <tr>).
            const data = await page.$$eval('tbody', tbodies => tbodies.map(tbody => {
                const tr = tbody.querySelector('tr');
                const tdName = tr ? tr.querySelector('td') : null;
                let name = tdName ? tdName.innerText.trim() : '';
                name = name.replace("------", "").trim();

                let parts = name.split(' ');
                let firstName = parts.shift();
                let lastName = parts.join(' ');

                const quote = (str) => `"${str.replace(/"/g, '""')}"`;

                firstName = quote(firstName);
                lastName = quote(lastName);
                let fullName = quote(name);

                const tdJobTitle = tr ? tr.querySelector('td:nth-child(2)') : null;
                let jobTitle = tdJobTitle ? tdJobTitle.innerText.trim() : '';
                jobTitle = quote(jobTitle);

                const tdCompanyName = tr ? tr.querySelector('td:nth-child(3)') : null;
                let companyName = tdCompanyName ? tdCompanyName.innerText.trim() : '';
                companyName = quote(companyName);

                const tdLocation = tr ? tr.querySelector('td:nth-child(5) .zp_Y6y8d') : null;
                let location = tdLocation ? tdLocation.innerText.trim() : '';
                location = quote(location);

                const tdEmployeeCount = tr ? tr.querySelector('td:nth-child(6)') : null;
                let employeeCount = tdEmployeeCount ? tdEmployeeCount.innerText.trim() : '';
                employeeCount = quote(employeeCount);

                const tdPhone = tr ? tr.querySelector('td:nth-child(7)') : null;
                let phone = tdPhone ? tdPhone.innerText.trim() : '';
                phone = phone.replace(/\D/g, ''); // keep digits only
                phone = phone.replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3'); // e.g. 4155551234 → (415) 555-1234
                phone = quote(phone);

                const tdIndustry = tr ? tr.querySelector('td:nth-child(8)') : null;
                let industry = tdIndustry ? tdIndustry.innerText.trim() : '';
                industry = quote(industry);

                const tdKeywords = tr ? tr.querySelector('td:nth-child(9)') : null;
                let keywords = tdKeywords ? tdKeywords.innerText.trim() : '';
                keywords = quote(keywords);

                let facebookUrl = '', twitterUrl = '', companyLinkedinUrl = '', companyUrl = '';

                if (tdCompanyName) {
                    const links = tdCompanyName.querySelectorAll('a[href]');
                    links.forEach(link => {
                        const href = link.href.trim();
                        if (href.includes('facebook.com')) facebookUrl = quote(href);
                        else if (href.includes('twitter.com')) twitterUrl = quote(href);
                        else if (href.includes('linkedin.com/company')) companyLinkedinUrl = quote(href);
                        else if (link.querySelector('.apollo-icon-link')) companyUrl = quote(href);
                    });
                }

                const firstHref = tbody.querySelector('a[href]') ? tbody.querySelector('a[href]').href : '';
                const linkedinUrl = tdName && tdName.querySelector('a[href*="linkedin.com/in"]') ? tdName.querySelector('a[href*="linkedin.com/in"]').href : '';

                return {
                    firstName: firstName,
                    lastName: lastName,
                    fullName: fullName,
                    jobTitle: jobTitle,
                    companyName: companyName,
                    location: location,
                    employeeCount: employeeCount,
                    phone: phone,
                    industry: industry,
                    firstHref: quote(firstHref),
                    linkedinUrl: quote(linkedinUrl),
                    facebookUrl: facebookUrl,
                    twitterUrl: twitterUrl,
                    companyLinkedinUrl: companyLinkedinUrl,
                    companyUrl: companyUrl,
                    keywords: keywords,
                };
            }));
            allData = allData.concat(data);
        }
        async function processPerson(person, newPage) {
            console.log(`Processing person: ${person.fullName}`);
            const cleanedUrl = person.firstHref.replace(/"/g, '');
            console.log(`Navigating to cleaned URL: ${cleanedUrl}`);

            try {
                await newPage.goto(cleanedUrl, { waitUntil: 'networkidle0' });
                console.log(`Page navigated to ${cleanedUrl}`);

                await newPage.waitForSelector('#general_information_card', { timeout: 10000 });
                console.log(`Found #general_information_card`);

                const emailElements = await newPage.$$eval('#general_information_card', elements => elements.map(element => element.innerText));
                const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
                let emails = emailElements.flatMap(element => element.match(emailRegex) || []);

                person.emails = emails.length > 0 ? emails : [''];
            } catch (error) {
                console.error(`Error processing ${person.fullName} at ${cleanedUrl}: ${error}`);
                person.emails = [''];
            }
        }

        const batchSize = 5; // up to five detail pages (tabs) open in parallel
        for (let i = 0; i < allData.length; i += batchSize) {
            const batch = allData.slice(i, i + batchSize);
            console.log(`Processing batch from index ${i} to ${i + batchSize - 1}`);

            await Promise.all(batch.map(async person => {
                const newPage = await browser.newPage();
                try {
                    return await processPerson(person, newPage);
                } catch (error) {
                    console.error(`Error processing ${person.fullName}: ${error}`);
                } finally {
                    await newPage.close();
                }
            }));
            console.log(`Completed batch from index ${i} to ${i + batchSize - 1}`);
        }

        const maxEmails = allData.reduce((max, p) => Math.max(max, p.emails.length), 0);
        const emailHeaders = Array.from({ length: maxEmails }, (_, i) => `Email ${i + 1}`).join(',');
        // Note: csvHeader is built for reference only; the rows below are
        // re-parsed and pushed as objects, so it is never written out.
        const csvHeader = `First Name,Last Name,Full Name,Job Title,Company Name,Location,Employee Count,Phone,Industry,URL,LinkedIn URL,Facebook URL,Twitter URL,Company LinkedIn URL,Company URL,Keywords,${emailHeaders}\n`;

        const csvRows = allData.map(person => {
            const paddedEmails = [...person.emails, ...Array(maxEmails - person.emails.length).fill('')];
            return `${person.firstName},${person.lastName},${person.fullName},${person.jobTitle},${person.companyName},${person.location},${person.employeeCount},${person.phone},${person.industry},${person.firstHref},${person.linkedinUrl},${person.facebookUrl},${person.twitterUrl},${person.companyLinkedinUrl},${person.companyUrl},${person.keywords},${paddedEmails.join(',')}`;
        }).join('\n');

        // csvRows contains one line per person and no header line,
        // so every line is processed.
        const lines = csvRows.split('\n');

        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];

            // Parse with a CSV-aware splitter that accounts for commas
            // within quoted fields (defined below)
            const fields = parseCSVLine(line);

            // Map the parsed fields back onto a structured row
            const row = {
                firstName: fields[0].trim() || '',
                lastName: fields[1].trim() || '',
                fullName: fields[2].trim() || '',
                jobTitle: fields[3].trim() || '',
                companyName: fields[4].trim() || '',
                location: fields[5].trim() || '',
                employeeCount: fields[6].trim() || '',
                phone: fields[7].trim() || '',
                industry: fields[8].trim() || '',
                url: fields[9].trim() || '',
                linkedinUrl: fields[10].trim() || '',
                facebookUrl: fields[11].trim() || '',
                twitterUrl: fields[12].trim() || '',
                companyLinkedinUrl: fields[13].trim() || '',
                companyUrl: fields[14].trim() || '',
                keywords: fields[15].trim() || '',
                email1: fields[16] ? fields[16].trim() : '',
                email2: fields[17] ? fields[17].trim() : '',
            };

            // Push each parsed row to the Actor's default dataset
            await Actor.pushData(row);
        }

        // Parse a single line of CSV, honouring commas inside quoted fields.
        // quote() above escapes embedded quotes by doubling them (""), so a
        // doubled quote inside a quoted field is treated as a literal quote.
        function parseCSVLine(line) {
            const result = [];
            let start = 0;
            let inQuotes = false;
            for (let i = 0; i < line.length; i++) {
                if (line[i] === '"') {
                    if (inQuotes && line[i + 1] === '"') {
                        i++; // skip the second half of an escaped quote
                    } else {
                        inQuotes = !inQuotes;
                    }
                } else if (line[i] === ',' && !inQuotes) {
                    result.push(line.substring(start, i));
                    start = i + 1;
                }
            }
            result.push(line.substring(start)); // Push the last field

            // Strip the surrounding quotes and un-escape doubled quotes
            return result.map(field => field.replace(/^"|"$/g, '').replace(/""/g, '"'));
        }
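
        // Example (hypothetical values):
        //   parseCSVLine('"Doe, Jane","Acme"') → ['Doe, Jane', 'Acme']
        // The comma inside the quoted field is preserved, not split on.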

    } else {
        console.log('Total count element not found; nothing to scrape.');
    }
    await browser.close();
    console.timeEnd("ScriptRunTime");
});
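
Note that the script serializes every record to a CSV string only to re-parse it before pushing. A simpler sketch (assuming the same allData shape, with the quote() wrapping dropped) would push the structured objects directly and let the Apify platform export CSV on demand:

// Minimal sketch: push structured records straight to the dataset.
for (const person of allData) {
    await Actor.pushData({
        ...person,
        email1: person.emails[0] || '',
        email2: person.emails[1] || '',
    });
}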

src/routes2.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});
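
This router is leftover template boilerplate and is never imported by src/main.js, which drives Puppeteer directly. If it were used, a minimal wiring (hypothetical, with a placeholder start URL) would look like:

// Hypothetical: run the router with a Crawlee PuppeteerCrawler.
import { PuppeteerCrawler } from 'crawlee';
import { router } from './routes2.js';

const crawler = new PuppeteerCrawler({ requestHandler: router });
await crawler.run(['https://apify.com']);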

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
    "name": "crawlee-puppeteer-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "puppeteer": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}