Apollo Easy Scrape 2
Deprecated
Pricing
Pay per usage
Go to Store
Apollo Easy Scrape 2
Deprecated
0.0 (0)
Pricing
Pay per usage
0
Total users
246
Monthly users
82
Last modified
a year ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "apollo-easy-scraper",
    "title": "Scrape leads from Apollo.io",
    "description": "The easiest way to scrape thousands of real leads from Apollo.io with emails.",
    "version": "0.0",
    "meta": {
        "templateId": "python-selenium"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Scrape data from Apollo.io",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "apollo_username": {
            "title": "Apollo Email 📪",
            "type": "string",
            "description": "Your Apollo Email",
            "editor": "textfield"
        },
        "apollo_password": {
            "title": "Apollo Password 🔒",
            "type": "string",
            "description": "Your Apollo Password",
            "editor": "textfield",
            "isSecret": true
        },
        "list_url": {
            "title": "List URL from Apollo 📜",
            "type": "string",
            "description": "The URL to navigate to after login.",
            "editor": "textfield"
        }
    },
    "required": ["apollo_username", "apollo_password", "list_url"]
}
src/__main__.py
1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())
src/main.py
1from bs4 import BeautifulSoup
2from apify import Actor
3import time
4import re
5from selenium import webdriver
6from selenium.webdriver.chrome.options import Options as ChromeOptions
7from selenium.webdriver.support import expected_conditions as EC
8from selenium.webdriver.support.ui import WebDriverWait
9from selenium.webdriver.common.keys import Keys
10from selenium.webdriver.common.by import By
11from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException
12
def find_email_address(element):
    """Extract e-mail addresses from a Selenium element's inner HTML.

    Args:
        element: A Selenium WebElement (anything exposing ``get_attribute``)
            whose ``innerHTML`` is scanned for addresses.

    Returns:
        list[str]: All e-mail addresses found, excluding Apollo's internal
        ``@sentry.io`` error-reporting addresses.
    """
    # Fixed the TLD character class: the original `[A-Z|a-z]` also matched a
    # literal '|', so garbage like "x@site.c|m" was accepted as an address.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, element.get_attribute('innerHTML'))
    # Drop Apollo's own Sentry monitoring addresses that leak into the dialog HTML.
    return [email for email in emails if '@sentry.io' not in email]
17
async def main():
    """Log into Apollo.io with the credentials from the Actor input, paginate
    through the provided list URL, scrape each contact row (name, title,
    company, social links, location, emails, ...) and push all collected
    records to the default Apify dataset.

    Reads from Actor input: ``apollo_username``, ``apollo_password``,
    ``list_url`` (all required).
    """
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        apollo_login_url = 'https://app.apollo.io/#/login'
        apollo_username = actor_input.get('apollo_username')
        apollo_password = actor_input.get('apollo_password')

        # Get the list URL from the actor input
        list_url = actor_input.get('list_url')

        # All three inputs are mandatory; bail out early if any is missing.
        if not apollo_username or not apollo_password or not list_url:
            Actor.log.error('Apollo credentials or list URL not provided in actor input.')
            return

        # Launch Selenium Chrome WebDriver
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Disable images to cut bandwidth and speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)

        # Perform login
        try:
            driver.get(apollo_login_url)
            time.sleep(1)
            email_input = driver.find_element(By.NAME, "email")
            email_input.send_keys(apollo_username)
            password_input = driver.find_element(By.ID, "current-password")
            password_input.send_keys(apollo_password)
            login_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            login_button.click()
            time.sleep(1)
            # NOTE(review): login success is assumed after a fixed 1s sleep;
            # a failed login is only detected later when no <tbody> appears.
            Actor.log.info('Successfully logged into Apollo account.')

            # Navigate to the list URL
            driver.get(list_url)
            time.sleep(1)
            Actor.log.info('Successfully at the list! Ready to scrape!')

            # Find tbodies and process each one
            time.sleep(1)

            # Accumulates person records across all scraped pages.
            all_data = []

            # Ensure the URL carries an explicit page number so pagination can
            # work by rewriting the trailing "page=<n>" query parameter.
            if 'page=' not in list_url:
                list_url += '&page=1'

            base_url, current_page = re.match(r"(.*page=)(\d+)", list_url).groups()
            current_page = int(current_page)


            while True:

                current_url = driver.current_url
                Actor.log.info(f"Current URL: {current_url}")

                # Set when a mid-page refresh invalidates the scraped rows;
                # the same page is then re-scraped without advancing.
                refresh_occurred = False

                # Records scraped from the current page only.
                page_data = []

                # Refresh the driver every 20 pages
                if current_page % 20 == 0:
                    driver.refresh()
                    Actor.log.info('Refreshing driver after 20 pages.')

                # Wait until tbodies are present on the page
                try:
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                    )
                except TimeoutException:
                    Actor.log.error("Timed out 30 seconds waiting for tbodies to be present on the page.")
                    # One retry after a refresh; a second timeout is taken to
                    # mean the list is exhausted and the crawl ends.
                    driver.refresh()
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                    except TimeoutException:
                        Actor.log.error("Timed out likely finished!")
                        break
                try:
                    tbodies = driver.find_elements(By.CSS_SELECTOR, "tbody")
                except StaleElementReferenceException:
                    driver.refresh()
                    Actor.log.info('Stale Element Refresh Happening!')
                    refresh_occurred = True
                    # NOTE(review): this `break` exits the outer while-loop and
                    # ends the whole crawl, unlike the stale-element handlers
                    # inside the row loop which only restart the current page —
                    # confirm this is intentional.
                    break


                Actor.log.info(f'On page {current_page} with {len(tbodies)} contacts found.')

                # Each <tbody> is one contact row in Apollo's results table.
                for tbody in tbodies:

                    person_data = {}

                    # Get the name (first <a> in the row; Apollo masks hidden
                    # names with runs of dashes, which are stripped here)
                    soup = BeautifulSoup(tbody.get_attribute('outerHTML'), 'html.parser')
                    first_a_element = soup.find('a')
                    name_text = first_a_element.text.replace('------', '') if first_a_element else ''
                    name_parts = name_text.split()
                    first_name = name_parts[0] if name_parts else ''
                    last_name = name_parts[-1] if len(name_parts) > 1 else ''
                    Actor.log.info(f'{name_text} is being added to the list!')

                    # Get the Personal LinkedIn URL
                    first_tr = soup.find('tr')
                    first_td = first_tr.find('td') if first_tr else None
                    linkedin_element = first_td.find('a', href=lambda href: href and 'linkedin.com' in href) if first_td else None
                    linkedin_url = linkedin_element['href'] if linkedin_element else ''

                    # Get the Job Title
                    # NOTE(review): the td indices below assume Apollo's default
                    # column layout — TODO confirm against the account's view.
                    tds = first_tr.find_all('td') if first_tr else []
                    second_td = tds[1] if len(tds) > 1 else None
                    job_title_element = second_td.find('span') if second_td else None
                    job_title = job_title_element.text if job_title_element else ''

                    # Get the Company LinkedIn, Twitter, Facebook, and Company URLs and Company Name
                    third_td = tds[2] if len(tds) > 2 else None
                    social_links = {
                        'linkedin': '',
                        'twitter': '',
                        'facebook': '',
                        'company': ''
                    }
                    company_name = ''
                    links = third_td.find_all('a') if third_td else []
                    for link in links:
                        href = link['href']
                        if 'linkedin.com' in href:
                            social_links['linkedin'] = href
                        elif 'twitter.com' in href or 'x.com' in href:
                            social_links['twitter'] = href
                        elif 'facebook.com' in href:
                            social_links['facebook'] = href
                        elif link.find('i', class_='apollo-icon-link'):
                            # The anchor carrying Apollo's "link" icon is the company website.
                            social_links['company'] = href
                        if not company_name:
                            company_name = link.text

                    company_linkedin_url = social_links['linkedin']
                    company_twitter_url = social_links['twitter']
                    company_facebook_url = social_links['facebook']
                    company_url = social_links['company']

                    # Get the Location
                    fifth_td = tds[4] if len(tds) > 4 else None
                    location_element = fifth_td.find('span') if fifth_td else None
                    location = location_element.text if location_element else ''

                    # Get the Employee Count
                    sixth_td = tds[5] if len(tds) > 5 else None
                    employee_count_element = sixth_td.find('span') if sixth_td else None
                    employee = employee_count_element.text if employee_count_element else ''

                    # Get the Industries
                    # NOTE: set() deduplicates but makes span order arbitrary.
                    eighth_td = tds[7] if len(tds) > 7 else None
                    industry_elements = eighth_td.find_all('span') if eighth_td else []
                    industries = ' '.join(set(industry.text.strip() for industry in industry_elements))

                    # Get the Keywords
                    ninth_td = tds[8] if len(tds) > 8 else None
                    keywords_elements = ninth_td.find_all('span') if ninth_td else []
                    keywords = ' '.join(set(keyword.text for keyword in keywords_elements if keyword.text.strip()))

                    Actor.log.info('Finding Button!')
                    # Existing try-except block for email collection: the email
                    # is revealed in a dropdown dialog behind a caret button.
                    try:
                        try:
                            button = tbody.find_element(By.CLASS_NAME, "apollo-icon-caret-down-small")
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break

                        Actor.log.info('Found Button!')
                        try:
                            button.click()
                            Actor.log.info('Click to Open!')
                        except ElementClickInterceptedException:
                            driver.refresh()
                            Actor.log.info('Broken Button Restarting Page!')
                            refresh_occurred = True
                            break
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break


                        dialog = driver.find_element(By.CSS_SELECTOR, "div[role='dialog']")
                        email_addresses = find_email_address(dialog)
                        Actor.log.info('All is still well!')
                        email_field = 'Email'
                        if email_addresses:
                            # First address goes to 'Email'; extras become 'Email_2', 'Email_3', ...
                            person_data[email_field] = email_addresses[0]
                            for i, email in enumerate(email_addresses[1:], start=2):
                                person_data[f'{email_field}_{i}'] = email
                            try:
                                button.click()
                                Actor.log.info('Click to Close!')
                            except ElementClickInterceptedException:
                                # Fall back to ESC when the open dialog blocks the caret button.
                                body_element = driver.find_element(By.TAG_NAME, 'body')
                                body_element.send_keys(Keys.ESCAPE)
                                Actor.log.info('Oh No! Apollo Sucks!')
                        else:
                            person_data[email_field] = ''

                    except NoSuchElementException:
                        # Row has no caret button (no email available).
                        Actor.log.info('Button was not found!')
                        person_data['Email'] = ''

                    # Populate the rest of person_data
                    person_data.update({
                        'First Name': first_name,
                        'Last Name': last_name,
                        'Full Name': name_text,
                        'Personal LinkedIn': linkedin_url,
                        'Company LinkedIn': company_linkedin_url,
                        'Company Twitter': company_twitter_url,
                        'Company Facebook': company_facebook_url,
                        'Title': job_title,
                        'Company URL': company_url,
                        'Company Name': company_name,
                        'Location': location,
                        'Employee Count': employee,
                        'Industries': industries,
                        'Keywords': keywords
                    })

                    Actor.log.info(person_data)

                    page_data.append(person_data)

                # A refresh invalidated this page's rows: discard page_data and
                # re-scrape the same page (current_page is not advanced).
                if refresh_occurred:
                    continue

                all_data.extend(page_data)

                # If less than 25 tbodies, break the loop after processing the current page
                # (a full Apollo page shows 25 rows, so fewer means the last page).
                if len(tbodies) < 25:
                    break

                # Increment the page number
                current_page += 1
                next_page_url = f"{base_url}{current_page}"
                try:
                    driver.get(next_page_url)
                    Actor.log.info(f'Navigated to page {current_page}.')
                except Exception as e:
                    Actor.log.error(f'Error navigating to page {current_page}: {str(e)}')
                    try:
                        driver.refresh()
                        driver.get(next_page_url)
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                        Actor.log.info(f'Refreshed and navigated to page {current_page}.')
                    except Exception as e:
                        Actor.log.error(f'Error refreshing and navigating to page {current_page}, likely the end.')
                        break

            # Save everything
            # NOTE(review): data is pushed only after the whole crawl finishes;
            # an unhandled error mid-run loses everything scraped so far.
            await Actor.push_data(all_data)
            Actor.log.info('Your Leads Are Ready!')

        except Exception as e:
            Actor.log.error(f'Error during login or scraping: {str(e)}')
        finally:
            # Always release the browser, even on failure.
            driver.quit()
297
# Allow running this module directly (outside the `python -m src` entry point
# used by the Dockerfile). The apify Python SDK exposes no `Actor.run`; the
# `main` coroutine must be driven by an event loop, the same way
# src/__main__.py starts it.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23.ruff_cache
24
25.scrapy
26*.log
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.5.1
5selenium ~= 4.14.0
Pricing
Pricing model
Pay per usage. This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.