
Apollo Easy Scrape 3
Deprecated
Pricing
Pay per usage
Go to Store

Apollo Easy Scrape 3
Deprecated
0.0 (0)
Pricing
Pay per usage
0
Total users
181
Monthly users
44
Last modified
a year ago
.actor/Dockerfile
# Base image for the actor. Other available images are listed at
# https://crawlee.dev/docs/guides/docker-images; any image from Docker Hub
# can be used instead.
FROM apify/actor-node-puppeteer-chrome:18

# Copy only package.json and package-lock.json first so the dependency
# install below can be served from the Docker layer cache.
COPY package*.json ./

# Install NPM packages without optional and development dependencies to keep
# the image small. Keep npm quiet, then print the dependency tree and the
# Node.js / NPM versions for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy the rest of the source code. Doing this after the NPM install keeps
# rebuilds fast when only source files change.
COPY . ./

# Start the actor. The XVFB start script is only needed for headful browsers
# and can be dropped if the actor always runs headless.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-15", "title": "Project Puppeteer Crawler JavaScript", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from Apollo.io", "type": "object", "schemaVersion": 1, "properties": { "apollo_username": { "title": "Apollo Email 📪", "type": "string", "description": "Your Apollo Email", "editor": "textfield" }, "apollo_password": { "title": "Apollo Password 🔒", "type": "string", "description": "Your Apollo Password", "editor": "textfield", "isSecret": true }, "list_url": { "title": "List URL from Apollo 📜", "type": "string", "description": "The URL to navigate to after login.", "editor": "textfield" } }, "required": ["apollo_username", "apollo_password", "list_url"]}
src/main.js
import { Actor } from 'apify';
import puppeteer from 'puppeteer';

/** Apollo login page. */
const LOGIN_URL = 'https://app.apollo.io/#/login';
/** Number of rows assumed per list page when computing the page count. */
const ITEMS_PER_PAGE = 25;
/** Number of contact detail pages opened concurrently. */
const DETAIL_BATCH_SIZE = 5;
/** Fixed pause (ms) after submitting the login form and after opening the list. */
const SETTLE_DELAY_MS = 2000;
/** Matches e-mail addresses inside free text. */
const EMAIL_REGEX = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;

/**
 * Resolves after the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

/**
 * Extracts the item count from the text of the "Total" link.
 * Thousands separators written as commas are removed; abbreviated counts
 * (e.g. "1.2K") are NOT expanded — TODO confirm what the list page renders.
 *
 * @param {string|null} totalText - Text of the link, or null when it was not found.
 * @returns {number} The parsed count, or 0 when no number is present.
 */
function parseTotalItems(totalText) {
    const match = totalText?.match(/\d[\d,]*/);
    return match ? Number.parseInt(match[0].replaceAll(',', ''), 10) : 0;
}

/**
 * Builds the URL of one page of the list.
 *
 * @param {string} baseUrl - List URL supplied in the actor input.
 * @param {number} pageNumber - 1-based page number.
 * @returns {string} The list URL with a `page` parameter appended.
 */
function buildPageUrl(baseUrl, pageNumber) {
    // Use "?" when the list URL has no query string yet, otherwise "&".
    const separator = baseUrl.includes('?') ? '&' : '?';
    return `${baseUrl}${separator}page=${pageNumber}`;
}

/**
 * Turns the `<tbody>` elements of the list table into plain row objects.
 *
 * NOTE: this function is handed to `page.$$eval`, so it is serialized and
 * executed inside the browser page. It must stay self-contained and must not
 * reference anything from module scope.
 *
 * @param {Element[]} tbodies - All `<tbody>` elements on the page.
 * @returns {object[]} One object per `<tbody>` that contains a row.
 */
function extractRows(tbodies) {
    // Trimmed innerText of the first match inside `row`, or '' when absent.
    const cellText = (row, selector) => row?.querySelector(selector)?.innerText.trim() ?? '';

    return tbodies
        .map((tbody) => {
            const tr = tbody.querySelector('tr');
            if (!tr) return null; // nothing to extract from an empty body

            const tdName = tr.querySelector('td');
            const fullName = (tdName?.innerText.trim() ?? '').replace('------', '').trim();
            const [firstName = '', ...otherNames] = fullName.split(' ');
            const lastName = otherNames.join(' ').trim();

            // Keep digits only, then format the first ten as "(xxx) xxx-xxxx".
            const phone = cellText(tr, 'td:nth-child(7)')
                .replace(/\D/g, '')
                .replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');

            let facebookUrl = '';
            let twitterUrl = '';
            let companyLinkedinUrl = '';
            let companyUrl = '';
            const tdCompany = tr.querySelector('td:nth-child(3)');
            for (const link of tdCompany?.querySelectorAll('a[href]') ?? []) {
                const href = link.href.trim();
                if (href.includes('facebook.com')) facebookUrl = href;
                else if (href.includes('twitter.com')) twitterUrl = href;
                else if (href.includes('linkedin.com/company')) companyLinkedinUrl = href;
                else if (link.querySelector('.apollo-icon-link')) companyUrl = href;
            }

            // Key order here is the key order of the stored dataset rows.
            return {
                firstName,
                lastName,
                fullName,
                jobTitle: cellText(tr, 'td:nth-child(2)'),
                companyName: tdCompany?.innerText.trim() ?? '',
                location: cellText(tr, 'td:nth-child(5) .zp_Y6y8d'),
                employeeCount: cellText(tr, 'td:nth-child(6)'),
                phone,
                industry: cellText(tr, 'td:nth-child(8)'),
                // First link of the row; it is opened later to look for e-mails.
                url: tbody.querySelector('a[href]')?.href ?? '',
                linkedinUrl: tdName?.querySelector('a[href*="linkedin.com/in"]')?.href ?? '',
                facebookUrl,
                twitterUrl,
                companyLinkedinUrl,
                companyUrl,
                keywords: cellText(tr, 'td:nth-child(9)'),
            };
        })
        .filter((row) => row !== null);
}

/**
 * Fills in and submits the Apollo login form.
 *
 * @param {import('puppeteer').Page} page - Page used for the whole list scrape.
 * @param {string} email - Apollo account e-mail.
 * @param {string} password - Apollo account password.
 * @returns {Promise<void>}
 */
async function logIn(page, email, password) {
    await page.goto(LOGIN_URL);
    await page.waitForSelector('input[name="email"]', { visible: true });
    await page.waitForSelector('input[name="password"]', { visible: true });
    await page.type('input[name="email"]', email);
    await page.type('input[name="password"]', password);
    await page.click('button[type="submit"]');
    // Fixed pause to let the login request finish before navigating away.
    await sleep(SETTLE_DELAY_MS);
}

/**
 * Reads the text of the first link whose text starts with "Total".
 *
 * @param {import('puppeteer').Page} page - Page showing the list.
 * @returns {Promise<string|null>} The trimmed link text, or null when no such link exists.
 */
function readTotalText(page) {
    return page.evaluate(() => {
        const anchor = Array.from(document.querySelectorAll('a'))
            .find((a) => a.textContent.trim().startsWith('Total'));
        return anchor ? anchor.textContent.trim() : null;
    });
}

/**
 * Opens a person's detail page in a new tab and collects the e-mail
 * addresses found in the `#general_information_card` element.
 * Failures are logged and reported as "no e-mails" so one bad page does not
 * abort the run; the tab is always closed.
 *
 * @param {import('puppeteer').Browser} browser - Browser to open the tab in.
 * @param {{ fullName: string, url: string }} person - Row produced by `extractRows`.
 * @returns {Promise<string[]>} Unique e-mail addresses in order of appearance.
 */
async function collectEmails(browser, person) {
    console.log(`Processing person: ${person.fullName}`);
    if (!person.url) return [];

    const detailPage = await browser.newPage();
    try {
        console.log(`Navigating to URL: ${person.url}`);
        await detailPage.goto(person.url, { waitUntil: 'networkidle0' });
        await detailPage.waitForSelector('#general_information_card', { timeout: 10000 });

        const cardTexts = await detailPage.$$eval(
            '#general_information_card',
            (cards) => cards.map((card) => card.innerText),
        );
        const found = cardTexts.flatMap((text) => text.match(EMAIL_REGEX) ?? []);
        return [...new Set(found)];
    } catch (error) {
        console.error(`Error processing ${person.fullName} at ${person.url}: ${error}`);
        return [];
    } finally {
        await detailPage.close();
    }
}

/**
 * Actor entry point: logs in to Apollo, walks every page of the list given in
 * the input, opens each person's detail page to look for e-mail addresses and
 * stores one dataset row per person.
 */
Actor.main(async () => {
    console.log('Actor starting...');

    const input = await Actor.getInput();
    if (!input?.apollo_username || !input?.apollo_password || !input?.list_url) {
        console.log('Missing required input fields.');
        return;
    }
    const { list_url: baseUrl, apollo_username: email, apollo_password: password } = input;

    console.time('ScriptRunTime');
    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
        headless: false, // Set to true to run headless
        defaultViewport: null,
    });

    try {
        const page = await browser.newPage();
        await logIn(page, email, password);
        await page.goto(baseUrl);
        await sleep(SETTLE_DELAY_MS);

        const totalItems = parseTotalItems(await readTotalText(page));
        if (totalItems === 0) {
            console.log('Element not found');
            return;
        }
        console.log(`Total items: ${totalItems}`);

        const totalPages = Math.ceil(totalItems / ITEMS_PER_PAGE);
        console.log(`Total pages: ${totalPages}`);

        // Step 1: collect the table rows of every list page.
        const people = [];
        for (let pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
            const pageUrl = buildPageUrl(baseUrl, pageNumber);
            console.log(`Scraping page: ${pageUrl}`);
            await page.goto(pageUrl);
            await page.waitForSelector('tbody', { visible: true });
            people.push(...await page.$$eval('tbody', extractRows));
        }

        // Step 2: visit the detail pages a few at a time and store each batch
        // as soon as it is done, so earlier results survive a later failure.
        for (let start = 0; start < people.length; start += DETAIL_BATCH_SIZE) {
            const batch = people.slice(start, start + DETAIL_BATCH_SIZE);
            const end = start + batch.length - 1;
            console.log(`Processing batch from index ${start} to ${end}`);

            const batchEmails = await Promise.all(
                batch.map((person) => collectEmails(browser, person)),
            );
            // Only the first two addresses are stored, as email1 / email2.
            const rows = batch.map((person, index) => ({
                ...person,
                email1: batchEmails[index][0] ?? '',
                email2: batchEmails[index][1] ?? '',
            }));
            await Actor.pushData(rows);

            console.log(`Completed batch from index ${start} to ${end}`);
        }
    } finally {
        // Always release the browser, even when a step above throws.
        await browser.close();
        console.timeEnd('ScriptRunTime');
    }
});
src/routes2.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

/**
 * Default route handler: calls `enqueueLinks` for URLs matching
 * `https://apify.com/*` and gives the new requests the `detail` label.
 */
async function enqueueDetailLinks({ enqueueLinks, log }) {
    log.info(`enqueueing new URLs`);

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
}

/**
 * Handler for requests labelled `detail`: logs the page title and stores the
 * loaded URL together with the title in the dataset.
 */
async function saveDetailTitle({ request, page, log }) {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    const record = { url: request.loadedUrl, title };
    await Dataset.pushData(record);
}

router.addDefaultHandler(enqueueDetailLinks);
router.addHandler('detail', saveDetailTitle);
.dockerignore
# configurations.idea
# crawlee and apify storage foldersapify_storagecrawlee_storagestorage
# installed filesnode_modules
# git folder.git
.editorconfig
root = true
[*]indent_style = spaceindent_size = 4charset = utf-8trim_trailing_whitespace = trueinsert_final_newline = trueend_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store.ideadistnode_modulesapify_storagestorage
package.json
{ "name": "crawlee-puppeteer-javascript", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "dependencies": { "apify": "^3.1.10", "crawlee": "^3.5.4", "puppeteer": "*" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "eslint": "^8.50.0" }, "scripts": { "start": "node src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}