Apollo Easy Scrape 2 avatar
Apollo Easy Scrape 2

Deprecated

Pricing

Pay per usage

Go to Store
Apollo Easy Scrape 2

Apollo Easy Scrape 2

Deprecated

Developed by

Mike Powers

Maintained by Community

0.0 (0)

Pricing

Pay per usage

0

Total users

246

Monthly users

82

Last modified

a year ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# This image bundles Python 3.11 with Selenium and a matching Chrome/driver.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build (Docker layer caching: dependency layers are
# reused unless requirements.txt itself changes).
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
# This catches syntax errors at build time rather than at Actor start.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "apollo-easy-scraper",
4    "title": "Scrape leads from Apollo.io",
5    "description": "The easiest way to scrape thousands of real leads from Apollo.io with emails.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "python-selenium"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "Scrape data from Apollo.io",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "apollo_username": {
7            "title": "Apollo Email 📪",
8            "type": "string",
9            "description": "Your Apollo Email",
10            "editor": "textfield"
11        },
12        "apollo_password": {
13            "title": "Apollo Password 🔒",
14            "type": "string",
15            "description": "Your Apollo Password",
16            "editor": "textfield",
17            "isSecret": true
18        },
19        "list_url": {
20            "title": "List URL from Apollo 📜",
21            "type": "string",
22            "description": "The URL to navigate to after login.",
23            "editor": "textfield"
24        }
25    },
26    "required": ["apollo_username", "apollo_password", "list_url"]
27}

src/__main__.py

"""
Entry point for executing the Apify Actor.

Configures the ``apify`` and ``apify_client`` loggers to share one formatted
stream handler, then runs the ``main()`` coroutine with ``asyncio.run()``.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# One shared handler so both SDK loggers emit identically formatted records.
_handler = logging.StreamHandler()
_handler.setFormatter(ActorLogFormatter())

# (logger name, level) pairs: the client logger stays at INFO while the
# Actor logger is verbose at DEBUG.
for _name, _level in (('apify_client', logging.INFO), ('apify', logging.DEBUG)):
    _logger = logging.getLogger(_name)
    _logger.setLevel(_level)
    _logger.addHandler(_handler)

# Execute the Actor main coroutine
asyncio.run(main())

src/main.py

1from bs4 import BeautifulSoup
2from apify import Actor
3import time
4import re
5from selenium import webdriver
6from selenium.webdriver.chrome.options import Options as ChromeOptions
7from selenium.webdriver.support import expected_conditions as EC
8from selenium.webdriver.support.ui import WebDriverWait
9from selenium.webdriver.common.keys import Keys
10from selenium.webdriver.common.by import By
11from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException 
12
def find_email_address(element):
    """Extract email addresses from a Selenium element's inner HTML.

    Args:
        element: Any object exposing ``get_attribute('innerHTML')`` that
            returns the element's markup as a string (e.g. a Selenium
            WebElement).

    Returns:
        list[str]: All email addresses found in the markup, in document
        order, excluding Apollo's internal ``@sentry.io`` monitoring
        addresses.
    """
    # Fix: the original TLD class was [A-Z|a-z], which wrongly admitted a
    # literal '|' character inside the top-level domain; [A-Za-z] is the
    # intended alternation-free class.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, element.get_attribute('innerHTML'))
    return [email for email in emails if '@sentry.io' not in email]
17
async def main():
    """Log into Apollo.io with Selenium, page through a saved lead list, and
    push every scraped contact to the Apify dataset.

    Reads ``apollo_username``, ``apollo_password`` and ``list_url`` from the
    Actor input. Pages are walked by rewriting the ``page=`` query parameter;
    scraping stops when a page yields fewer than 25 rows or navigation /
    element waits fail twice in a row (treated as end of list).
    """
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        apollo_login_url = 'https://app.apollo.io/#/login'
        apollo_username = actor_input.get('apollo_username')
        apollo_password = actor_input.get('apollo_password')

        # Get the list URL from the actor input
        list_url = actor_input.get('list_url')

        # All three inputs are mandatory; bail out early with a log message.
        if not apollo_username or not apollo_password or not list_url:
            Actor.log.error('Apollo credentials or list URL not provided in actor input.')
            return

        # Launch Selenium Chrome WebDriver
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Disable images to cut bandwidth and speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)

        # Perform login
        try:
            driver.get(apollo_login_url)
            time.sleep(1)
            email_input = driver.find_element(By.NAME, "email")
            email_input.send_keys(apollo_username)
            password_input = driver.find_element(By.ID, "current-password")
            password_input.send_keys(apollo_password)
            login_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            login_button.click()
            time.sleep(1)
            # NOTE(review): success is assumed after the click plus a 1s sleep;
            # a failed login is not detected here and would only surface later
            # as a timeout waiting for tbodies.
            Actor.log.info('Successfully logged into Apollo account.')

            # Navigate to the list URL
            driver.get(list_url)
            time.sleep(1)
            Actor.log.info('Successfully at the list! Ready to scrape!')

            # Find tbodies and process each one
            time.sleep(1)

            all_data = []

            # Ensure the URL carries an explicit page number so it can be
            # incremented below.
            # NOTE(review): appending '&page=1' assumes the URL already has a
            # query string ('?'); a bare URL would become malformed. Also, a
            # parameter like 'per_page=' would falsely satisfy this check.
            if 'page=' not in list_url:
                list_url += '&page=1'

            base_url, current_page = re.match(r"(.*page=)(\d+)", list_url).groups()
            current_page = int(current_page)


            while True:

                current_url = driver.current_url
                Actor.log.info(f"Current URL: {current_url}")

                # Set when a mid-page refresh forces this page to be redone.
                refresh_occurred = False

                page_data = []

                # Refresh the driver every 20 pages
                if current_page % 20 == 0:
                    driver.refresh()
                    Actor.log.info('Refreshing driver after 20 pages.')

                # Wait until tbodies are present on the page
                try:
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                    )
                except TimeoutException:
                    Actor.log.error("Timed out 30 seconds waiting for tbodies to be present on the page.")
                    driver.refresh()
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                    except TimeoutException:
                        # Two consecutive timeouts: assume the list is finished.
                        Actor.log.error("Timed out likely finished!")
                        break
                try:
                    tbodies = driver.find_elements(By.CSS_SELECTOR, "tbody")
                except StaleElementReferenceException:
                    driver.refresh()
                    Actor.log.info('Stale Element Refresh Happening!')
                    refresh_occurred = True
                    # NOTE(review): this 'break' exits the while loop entirely,
                    # abandoning all remaining pages; 'continue' (retry the
                    # refreshed page) looks like the intended behavior.
                    break


                Actor.log.info(f'On page {current_page} with {len(tbodies)} contacts found.')

                # Each tbody is one contact row; parse its HTML with BeautifulSoup.
                for tbody in tbodies:

                    person_data = {}

                    # Get the name
                    soup = BeautifulSoup(tbody.get_attribute('outerHTML'), 'html.parser')
                    first_a_element = soup.find('a')
                    # '------' is Apollo's placeholder filler; strip it out.
                    name_text = first_a_element.text.replace('------', '') if first_a_element else ''
                    name_parts = name_text.split()
                    first_name = name_parts[0] if name_parts else ''
                    last_name = name_parts[-1] if len(name_parts) > 1 else ''
                    Actor.log.info(f'{name_text} is being added to the list!')

                    # Get the Personal LinkedIn URL
                    first_tr = soup.find('tr')
                    first_td = first_tr.find('td') if first_tr else None
                    linkedin_element = first_td.find('a', href=lambda href: href and 'linkedin.com' in href) if first_td else None
                    linkedin_url = linkedin_element['href'] if linkedin_element else ''

                    # Get the Job Title (second column)
                    tds = first_tr.find_all('td') if first_tr else []
                    second_td = tds[1] if len(tds) > 1 else None
                    job_title_element = second_td.find('span') if second_td else None
                    job_title = job_title_element.text if job_title_element else ''

                    # Get the Company LinkedIn, Twitter, Facebook, and Company URLs and Company Name
                    third_td = tds[2] if len(tds) > 2 else None
                    social_links = {
                        'linkedin': '',
                        'twitter': '',
                        'facebook': '',
                        'company': ''
                    }
                    company_name = ''
                    links = third_td.find_all('a') if third_td else []
                    for link in links:
                        href = link['href']
                        if 'linkedin.com' in href:
                            social_links['linkedin'] = href
                        elif 'twitter.com' in href or 'x.com' in href:
                            social_links['twitter'] = href
                        elif 'facebook.com' in href:
                            social_links['facebook'] = href
                        elif link.find('i', class_='apollo-icon-link'):
                            # The link icon marks the company's own website.
                            social_links['company'] = href
                        if not company_name:
                            # First anchor's text doubles as the company name.
                            company_name = link.text

                    company_linkedin_url = social_links['linkedin']
                    company_twitter_url = social_links['twitter']
                    company_facebook_url = social_links['facebook']
                    company_url = social_links['company']

                    # Get the Location (fifth column)
                    fifth_td = tds[4] if len(tds) > 4 else None
                    location_element = fifth_td.find('span') if fifth_td else None
                    location = location_element.text if location_element else ''

                    # Get the Employee Count (sixth column)
                    sixth_td = tds[5] if len(tds) > 5 else None
                    employee_count_element = sixth_td.find('span') if sixth_td else None
                    employee = employee_count_element.text if employee_count_element else ''

                    # Get the Industries (eighth column)
                    eighth_td = tds[7] if len(tds) > 7 else None
                    industry_elements = eighth_td.find_all('span') if eighth_td else []
                    # NOTE(review): set() deduplicates but makes word order
                    # non-deterministic across runs.
                    industries = ' '.join(set(industry.text.strip() for industry in industry_elements))

                    # Get the Keywords (ninth column); same set() caveat as above.
                    ninth_td = tds[8] if len(tds) > 8 else None
                    keywords_elements = ninth_td.find_all('span') if ninth_td else []
                    keywords = ' '.join(set(keyword.text for keyword in keywords_elements if keyword.text.strip()))

                    Actor.log.info('Finding Button!')
                    # Existing try-except block for email collection
                    try:
                        try:
                            # The caret button opens the per-contact email dialog.
                            button = tbody.find_element(By.CLASS_NAME, "apollo-icon-caret-down-small")
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break

                        Actor.log.info('Found Button!')
                        try:
                            button.click()
                            Actor.log.info('Click to Open!')
                        except ElementClickInterceptedException:
                            driver.refresh()
                            Actor.log.info('Broken Button Restarting Page!')
                            refresh_occurred = True
                            break
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break


                        # Scrape every email shown in the opened dialog.
                        dialog = driver.find_element(By.CSS_SELECTOR, "div[role='dialog']")
                        email_addresses = find_email_address(dialog)
                        Actor.log.info('All is still well!')
                        email_field = 'Email'
                        if email_addresses:
                            person_data[email_field] = email_addresses[0]
                            # Additional addresses become Email_2, Email_3, ...
                            for i, email in enumerate(email_addresses[1:], start=2):
                                person_data[f'{email_field}_{i}'] = email
                            try:
                                button.click()
                                Actor.log.info('Click to Close!')
                            except ElementClickInterceptedException:
                                # Fall back to ESC when the close click is blocked.
                                body_element = driver.find_element(By.TAG_NAME, 'body')
                                body_element.send_keys(Keys.ESCAPE)
                                Actor.log.info('Oh No! Apollo Sucks!')
                        else:
                            person_data[email_field] = ''

                    except NoSuchElementException:
                        # No caret button for this row: record an empty email.
                        Actor.log.info('Button was not found!')
                        person_data['Email'] = ''

                    # Populate the rest of person_data
                    person_data.update({
                        'First Name': first_name,
                        'Last Name': last_name,
                        'Full Name': name_text,
                        'Personal LinkedIn': linkedin_url,
                        'Company LinkedIn': company_linkedin_url,
                        'Company Twitter': company_twitter_url,
                        'Company Facebook': company_facebook_url,
                        'Title': job_title,
                        'Company URL': company_url,
                        'Company Name': company_name,
                        'Location': location,
                        'Employee Count': employee,
                        'Industries': industries,
                        'Keywords': keywords
                    })

                    Actor.log.info(person_data)

                    page_data.append(person_data)

                # Redo the page if a refresh interrupted it; the partial
                # page_data from the interrupted pass is intentionally discarded.
                if refresh_occurred:
                    continue

                all_data.extend(page_data)

                # If less than 25 tbodies, break the loop after processing the current page
                # (25 rows is a full Apollo list page, so fewer means last page).
                if len(tbodies) < 25:
                    break

                # Increment the page number
                current_page += 1
                next_page_url = f"{base_url}{current_page}"
                try:
                    driver.get(next_page_url)
                    Actor.log.info(f'Navigated to page {current_page}.')
                except Exception as e:
                    Actor.log.error(f'Error navigating to page {current_page}: {str(e)}')
                    try:
                        # One retry: refresh, re-navigate, and wait for rows.
                        driver.refresh()
                        driver.get(next_page_url)
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                        Actor.log.info(f'Refreshed and navigated to page {current_page}.')
                    except Exception as e:
                        Actor.log.error(f'Error refreshing and navigating to page {current_page}, likely the end.')
                        break

            # Save everything collected across all pages to the Apify dataset.
            await Actor.push_data(all_data)
            Actor.log.info('Your Leads Are Ready!')

        except Exception as e:
            # Top-level boundary: log and fall through to cleanup.
            Actor.log.error(f'Error during login or scraping: {str(e)}')
        finally:
            # Always release the browser, even on failure.
            driver.quit()
297
if __name__ == "__main__":
    # Direct-execution entry point; normal Actor runs go through src/__main__.py.
    Actor.run(main)

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23.ruff_cache
24
25.scrapy
26*.log

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.5.1
5selenium ~= 4.14.0

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.