Apollo Easy Scrape 2
Deprecated
Pricing: Pay per usage
Total users: 246
Monthly users: 82
Last modified: a year ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "apollo-easy-scraper", "title": "Scrape leads from Apollo.io", "description": "The easiest way to scrape thousands of real leads from Apollo.io with emails.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from Apollo.io", "type": "object", "schemaVersion": 1, "properties": { "apollo_username": { "title": "Apollo Email 📪", "type": "string", "description": "Your Apollo Email", "editor": "textfield" }, "apollo_password": { "title": "Apollo Password 🔒", "type": "string", "description": "Your Apollo Password", "editor": "textfield", "isSecret": true }, "list_url": { "title": "List URL from Apollo 📜", "type": "string", "description": "The URL to navigate to after login.", "editor": "textfield" } }, "required": ["apollo_username", "apollo_password", "list_url"]}
src/__main__.py
1"""2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging3settings. The `main()` coroutine is then executed using `asyncio.run()`.4
5Feel free to modify this file to suit your specific needs.6"""7
8import asyncio9import logging10
11from apify.log import ActorLogFormatter12
13from .main import main14
15# Configure loggers16handler = logging.StreamHandler()17handler.setFormatter(ActorLogFormatter())18
19apify_client_logger = logging.getLogger('apify_client')20apify_client_logger.setLevel(logging.INFO)21apify_client_logger.addHandler(handler)22
23apify_logger = logging.getLogger('apify')24apify_logger.setLevel(logging.DEBUG)25apify_logger.addHandler(handler)26
27# Execute the Actor main coroutine28asyncio.run(main())
src/main.py
from bs4 import BeautifulSoup
from apify import Actor
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException
def find_email_address(element):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, element.get_attribute('innerHTML'))
    return [email for email in emails if '@sentry.io' not in email]
async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        apollo_login_url = 'https://app.apollo.io/#/login'
        apollo_username = actor_input.get('apollo_username')
        apollo_password = actor_input.get('apollo_password')

        # Get the list URL from the actor input
        list_url = actor_input.get('list_url')

        if not apollo_username or not apollo_password or not list_url:
            Actor.log.error('Apollo credentials or list URL not provided in actor input.')
            return

        # Launch Selenium Chrome WebDriver
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Disable images
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)

        # Perform login
        try:
            driver.get(apollo_login_url)
            time.sleep(1)
            email_input = driver.find_element(By.NAME, "email")
            email_input.send_keys(apollo_username)
            password_input = driver.find_element(By.ID, "current-password")
            password_input.send_keys(apollo_password)
            login_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            login_button.click()
            time.sleep(1)
            Actor.log.info('Successfully logged into Apollo account.')

            # Navigate to the list URL
            driver.get(list_url)
            time.sleep(1)
            Actor.log.info('Successfully at the list! Ready to scrape!')

            # Find tbodies and process each one
            time.sleep(1)

            all_data = []

            if 'page=' not in list_url:
                list_url += '&page=1'

            base_url, current_page = re.match(r"(.*page=)(\d+)", list_url).groups()
            current_page = int(current_page)

            while True:
                current_url = driver.current_url
                Actor.log.info(f"Current URL: {current_url}")

                refresh_occurred = False

                page_data = []

                # Refresh the driver every 20 pages
                if current_page % 20 == 0:
                    driver.refresh()
                    Actor.log.info('Refreshing driver after 20 pages.')

                # Wait until tbodies are present on the page
                try:
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                    )
                except TimeoutException:
                    Actor.log.error("Timed out 30 seconds waiting for tbodies to be present on the page.")
                    driver.refresh()
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                    except TimeoutException:
                        Actor.log.error("Timed out, likely finished!")
                        break

                try:
                    tbodies = driver.find_elements(By.CSS_SELECTOR, "tbody")
                except StaleElementReferenceException:
                    driver.refresh()
                    Actor.log.info('Stale Element Refresh Happening!')
                    refresh_occurred = True
                    break

                Actor.log.info(f'On page {current_page} with {len(tbodies)} contacts found.')

                for tbody in tbodies:
                    person_data = {}

                    # Get the name
                    soup = BeautifulSoup(tbody.get_attribute('outerHTML'), 'html.parser')
                    first_a_element = soup.find('a')
                    name_text = first_a_element.text.replace('------', '') if first_a_element else ''
                    name_parts = name_text.split()
                    first_name = name_parts[0] if name_parts else ''
                    last_name = name_parts[-1] if len(name_parts) > 1 else ''
                    Actor.log.info(f'{name_text} is being added to the list!')

                    # Get the Personal LinkedIn URL
                    first_tr = soup.find('tr')
                    first_td = first_tr.find('td') if first_tr else None
                    linkedin_element = first_td.find('a', href=lambda href: href and 'linkedin.com' in href) if first_td else None
                    linkedin_url = linkedin_element['href'] if linkedin_element else ''

                    # Get the Job Title
                    tds = first_tr.find_all('td') if first_tr else []
                    second_td = tds[1] if len(tds) > 1 else None
                    job_title_element = second_td.find('span') if second_td else None
                    job_title = job_title_element.text if job_title_element else ''

                    # Get the Company LinkedIn, Twitter, Facebook, and Company URLs and Company Name
                    third_td = tds[2] if len(tds) > 2 else None
                    social_links = {
                        'linkedin': '',
                        'twitter': '',
                        'facebook': '',
                        'company': ''
                    }
                    company_name = ''
                    links = third_td.find_all('a') if third_td else []
                    for link in links:
                        href = link['href']
                        if 'linkedin.com' in href:
                            social_links['linkedin'] = href
                        elif 'twitter.com' in href or 'x.com' in href:
                            social_links['twitter'] = href
                        elif 'facebook.com' in href:
                            social_links['facebook'] = href
                        elif link.find('i', class_='apollo-icon-link'):
                            social_links['company'] = href
                        if not company_name:
                            company_name = link.text

                    company_linkedin_url = social_links['linkedin']
                    company_twitter_url = social_links['twitter']
                    company_facebook_url = social_links['facebook']
                    company_url = social_links['company']

                    # Get the Location
                    fifth_td = tds[4] if len(tds) > 4 else None
                    location_element = fifth_td.find('span') if fifth_td else None
                    location = location_element.text if location_element else ''

                    # Get the Employee Count
                    sixth_td = tds[5] if len(tds) > 5 else None
                    employee_count_element = sixth_td.find('span') if sixth_td else None
                    employee = employee_count_element.text if employee_count_element else ''

                    # Get the Industries
                    eighth_td = tds[7] if len(tds) > 7 else None
                    industry_elements = eighth_td.find_all('span') if eighth_td else []
                    industries = ' '.join(set(industry.text.strip() for industry in industry_elements))

                    # Get the Keywords
                    ninth_td = tds[8] if len(tds) > 8 else None
                    keywords_elements = ninth_td.find_all('span') if ninth_td else []
                    keywords = ' '.join(set(keyword.text for keyword in keywords_elements if keyword.text.strip()))

                    Actor.log.info('Finding Button!')
                    # Existing try-except block for email collection
                    try:
                        try:
                            button = tbody.find_element(By.CLASS_NAME, "apollo-icon-caret-down-small")
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break

                        Actor.log.info('Found Button!')
                        try:
                            button.click()
                            Actor.log.info('Click to Open!')
                        except ElementClickInterceptedException:
                            driver.refresh()
                            Actor.log.info('Broken Button Restarting Page!')
                            refresh_occurred = True
                            break
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break

                        dialog = driver.find_element(By.CSS_SELECTOR, "div[role='dialog']")
                        email_addresses = find_email_address(dialog)
                        Actor.log.info('All is still well!')
                        email_field = 'Email'
                        if email_addresses:
                            person_data[email_field] = email_addresses[0]
                            for i, email in enumerate(email_addresses[1:], start=2):
                                person_data[f'{email_field}_{i}'] = email
                            try:
                                button.click()
                                Actor.log.info('Click to Close!')
                            except ElementClickInterceptedException:
                                body_element = driver.find_element(By.TAG_NAME, 'body')
                                body_element.send_keys(Keys.ESCAPE)
                                Actor.log.info('Oh No! Apollo Sucks!')
                        else:
                            person_data[email_field] = ''

                    except NoSuchElementException:
                        Actor.log.info('Button was not found!')
                        person_data['Email'] = ''

                    # Populate the rest of person_data
                    person_data.update({
                        'First Name': first_name,
                        'Last Name': last_name,
                        'Full Name': name_text,
                        'Personal LinkedIn': linkedin_url,
                        'Company LinkedIn': company_linkedin_url,
                        'Company Twitter': company_twitter_url,
                        'Company Facebook': company_facebook_url,
                        'Title': job_title,
                        'Company URL': company_url,
                        'Company Name': company_name,
                        'Location': location,
                        'Employee Count': employee,
                        'Industries': industries,
                        'Keywords': keywords
                    })

                    Actor.log.info(person_data)

                    page_data.append(person_data)

                if refresh_occurred:
                    continue

                all_data.extend(page_data)

                # If less than 25 tbodies, break the loop after processing the current page
                if len(tbodies) < 25:
                    break

                # Increment the page number
                current_page += 1
                next_page_url = f"{base_url}{current_page}"
                try:
                    driver.get(next_page_url)
                    Actor.log.info(f'Navigated to page {current_page}.')
                except Exception as e:
                    Actor.log.error(f'Error navigating to page {current_page}: {str(e)}')
                    try:
                        driver.refresh()
                        driver.get(next_page_url)
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                        Actor.log.info(f'Refreshed and navigated to page {current_page}.')
                    except Exception:
                        Actor.log.error(f'Error refreshing and navigating to page {current_page}, likely the end.')
                        break

            # Save everything
            await Actor.push_data(all_data)
            Actor.log.info('Your Leads Are Ready!')

        except Exception as e:
            Actor.log.error(f'Error during login or scraping: {str(e)}')
        finally:
            driver.quit()
# Allow running this module directly; the Actor entry point (src/__main__.py)
# launches the same coroutine via asyncio.run().
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
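A note on the pagination above: main() normalizes the list URL to carry an explicit page= parameter, splits it into a base and a page number, and rebuilds the URL with an incremented number to reach each next page. A minimal standalone sketch of that step, using a made-up URL:

import re

# Hypothetical Apollo list URL; it may or may not already carry a page parameter.
list_url = 'https://app.apollo.io/#/people?finderViewId=123'

# Normalize the URL so it always has an explicit page parameter, as main() does.
if 'page=' not in list_url:
    list_url += '&page=1'

# Split into the base (everything up to and including "page=") and the page number.
base_url, current_page = re.match(r"(.*page=)(\d+)", list_url).groups()
current_page = int(current_page)

# The next page is reached by re-appending an incremented number.
next_page_url = f"{base_url}{current_page + 1}"
print(next_page_url)  # https://app.apollo.io/#/people?finderViewId=123&page=2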
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.5.1
selenium ~= 4.14.0