Apollo Easy Scrape 2
Deprecated
Pricing: Pay per usage
Total users: 246
Monthly users: 82
Last modified: a year ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "apollo-easy-scraper", "title": "Scrape leads from Apollo.io", "description": "The easiest way to scrape thousands of real leads from Apollo.io with emails.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from Apollo.io", "type": "object", "schemaVersion": 1, "properties": { "apollo_username": { "title": "Apollo Email 📪", "type": "string", "description": "Your Apollo Email", "editor": "textfield" }, "apollo_password": { "title": "Apollo Password 🔒", "type": "string", "description": "Your Apollo Password", "editor": "textfield", "isSecret": true }, "list_url": { "title": "List URL from Apollo 📜", "type": "string", "description": "The URL to navigate to after login.", "editor": "textfield" } }, "required": ["apollo_username", "apollo_password", "list_url"]}
src/__main__.py
1"""2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging3settings. The `main()` coroutine is then executed using `asyncio.run()`.4
5Feel free to modify this file to suit your specific needs.6"""7
8import asyncio9import logging10
11from apify.log import ActorLogFormatter12
13from .main import main14
15# Configure loggers16handler = logging.StreamHandler()17handler.setFormatter(ActorLogFormatter())18
19apify_client_logger = logging.getLogger('apify_client')20apify_client_logger.setLevel(logging.INFO)21apify_client_logger.addHandler(handler)22
23apify_logger = logging.getLogger('apify')24apify_logger.setLevel(logging.DEBUG)25apify_logger.addHandler(handler)26
27# Execute the Actor main coroutine28asyncio.run(main())
src/main.py
from bs4 import BeautifulSoup
from apify import Actor
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException
def find_email_address(element):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, element.get_attribute('innerHTML'))
    return [email for email in emails if '@sentry.io' not in email]
async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        apollo_login_url = 'https://app.apollo.io/#/login'
        apollo_username = actor_input.get('apollo_username')
        apollo_password = actor_input.get('apollo_password')

        # Get the list URL from the actor input
        list_url = actor_input.get('list_url')

        if not apollo_username or not apollo_password or not list_url:
            Actor.log.error('Apollo credentials or list URL not provided in actor input.')
            return

        # Launch Selenium Chrome WebDriver
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Disable images
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)

        # Perform login
        try:
            driver.get(apollo_login_url)
            time.sleep(1)
            email_input = driver.find_element(By.NAME, "email")
            email_input.send_keys(apollo_username)
            password_input = driver.find_element(By.ID, "current-password")
            password_input.send_keys(apollo_password)
            login_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
            login_button.click()
            time.sleep(1)
            Actor.log.info('Successfully logged into Apollo account.')

            # Navigate to the list URL
            driver.get(list_url)
            time.sleep(1)
            Actor.log.info('Successfully at the list! Ready to scrape!')

            # Find tbodies and process each one
            time.sleep(1)

            all_data = []

            if 'page=' not in list_url:
                list_url += '&page=1'

            base_url, current_page = re.match(r"(.*page=)(\d+)", list_url).groups()
            current_page = int(current_page)

            while True:
                current_url = driver.current_url
                Actor.log.info(f"Current URL: {current_url}")

                refresh_occurred = False

                page_data = []

                # Refresh the driver every 20 pages
                if current_page % 20 == 0:
                    driver.refresh()
                    Actor.log.info('Refreshing driver after 20 pages.')

                # Wait until tbodies are present on the page
                try:
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                    )
                except TimeoutException:
                    Actor.log.error("Timed out 30 seconds waiting for tbodies to be present on the page.")
                    driver.refresh()
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                    except TimeoutException:
                        Actor.log.error("Timed out, likely finished!")
                        break

                try:
                    tbodies = driver.find_elements(By.CSS_SELECTOR, "tbody")
                except StaleElementReferenceException:
                    driver.refresh()
                    Actor.log.info('Stale Element Refresh Happening!')
                    refresh_occurred = True
                    break

                Actor.log.info(f'On page {current_page} with {len(tbodies)} contacts found.')

                for tbody in tbodies:
                    person_data = {}

                    # Get the name
                    soup = BeautifulSoup(tbody.get_attribute('outerHTML'), 'html.parser')
                    first_a_element = soup.find('a')
                    name_text = first_a_element.text.replace('------', '') if first_a_element else ''
                    name_parts = name_text.split()
                    first_name = name_parts[0] if name_parts else ''
                    last_name = name_parts[-1] if len(name_parts) > 1 else ''
                    Actor.log.info(f'{name_text} is being added to the list!')

                    # Get the Personal LinkedIn URL
                    first_tr = soup.find('tr')
                    first_td = first_tr.find('td') if first_tr else None
                    linkedin_element = first_td.find('a', href=lambda href: href and 'linkedin.com' in href) if first_td else None
                    linkedin_url = linkedin_element['href'] if linkedin_element else ''

                    # Get the Job Title
                    tds = first_tr.find_all('td') if first_tr else []
                    second_td = tds[1] if len(tds) > 1 else None
                    job_title_element = second_td.find('span') if second_td else None
                    job_title = job_title_element.text if job_title_element else ''

                    # Get the Company LinkedIn, Twitter, Facebook, and Company URLs and Company Name
                    third_td = tds[2] if len(tds) > 2 else None
                    social_links = {
                        'linkedin': '',
                        'twitter': '',
                        'facebook': '',
                        'company': ''
                    }
                    company_name = ''
                    links = third_td.find_all('a') if third_td else []
                    for link in links:
                        href = link['href']
                        if 'linkedin.com' in href:
                            social_links['linkedin'] = href
                        elif 'twitter.com' in href or 'x.com' in href:
                            social_links['twitter'] = href
                        elif 'facebook.com' in href:
                            social_links['facebook'] = href
                        elif link.find('i', class_='apollo-icon-link'):
                            social_links['company'] = href
                        if not company_name:
                            company_name = link.text

                    company_linkedin_url = social_links['linkedin']
                    company_twitter_url = social_links['twitter']
                    company_facebook_url = social_links['facebook']
                    company_url = social_links['company']

                    # Get the Location
                    fifth_td = tds[4] if len(tds) > 4 else None
                    location_element = fifth_td.find('span') if fifth_td else None
                    location = location_element.text if location_element else ''

                    # Get the Employee Count
                    sixth_td = tds[5] if len(tds) > 5 else None
                    employee_count_element = sixth_td.find('span') if sixth_td else None
                    employee = employee_count_element.text if employee_count_element else ''

                    # Get the Industries
                    eighth_td = tds[7] if len(tds) > 7 else None
                    industry_elements = eighth_td.find_all('span') if eighth_td else []
                    industries = ' '.join(set(industry.text.strip() for industry in industry_elements))

                    # Get the Keywords
                    ninth_td = tds[8] if len(tds) > 8 else None
                    keywords_elements = ninth_td.find_all('span') if ninth_td else []
                    keywords = ' '.join(set(keyword.text for keyword in keywords_elements if keyword.text.strip()))

                    Actor.log.info('Finding Button!')
                    # Existing try-except block for email collection
                    try:
                        try:
                            button = tbody.find_element(By.CLASS_NAME, "apollo-icon-caret-down-small")
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break

                        Actor.log.info('Found Button!')
                        try:
                            button.click()
                            Actor.log.info('Click to Open!')
                        except ElementClickInterceptedException:
                            driver.refresh()
                            Actor.log.info('Broken Button Restarting Page!')
                            refresh_occurred = True
                            break
                        except StaleElementReferenceException:
                            driver.refresh()
                            Actor.log.info('Stale Element Refresh Happening!')
                            refresh_occurred = True
                            break

                        dialog = driver.find_element(By.CSS_SELECTOR, "div[role='dialog']")
                        email_addresses = find_email_address(dialog)
                        Actor.log.info('All is still well!')
                        email_field = 'Email'
                        if email_addresses:
                            person_data[email_field] = email_addresses[0]
                            for i, email in enumerate(email_addresses[1:], start=2):
                                person_data[f'{email_field}_{i}'] = email
                            try:
                                button.click()
                                Actor.log.info('Click to Close!')
                            except ElementClickInterceptedException:
                                body_element = driver.find_element(By.TAG_NAME, 'body')
                                body_element.send_keys(Keys.ESCAPE)
                                Actor.log.info('Oh No! Apollo Sucks!')
                        else:
                            person_data[email_field] = ''

                    except NoSuchElementException:
                        Actor.log.info('Button was not found!')
                        person_data['Email'] = ''

                    # Populate the rest of person_data
                    person_data.update({
                        'First Name': first_name,
                        'Last Name': last_name,
                        'Full Name': name_text,
                        'Personal LinkedIn': linkedin_url,
                        'Company LinkedIn': company_linkedin_url,
                        'Company Twitter': company_twitter_url,
                        'Company Facebook': company_facebook_url,
                        'Title': job_title,
                        'Company URL': company_url,
                        'Company Name': company_name,
                        'Location': location,
                        'Employee Count': employee,
                        'Industries': industries,
                        'Keywords': keywords
                    })

                    Actor.log.info(person_data)

                    page_data.append(person_data)

                if refresh_occurred:
                    continue

                all_data.extend(page_data)

                # If less than 25 tbodies, break the loop after processing the current page
                if len(tbodies) < 25:
                    break

                # Increment the page number
                current_page += 1
                next_page_url = f"{base_url}{current_page}"
                try:
                    driver.get(next_page_url)
                    Actor.log.info(f'Navigated to page {current_page}.')
                except Exception as e:
                    Actor.log.error(f'Error navigating to page {current_page}: {str(e)}')
                    try:
                        driver.refresh()
                        driver.get(next_page_url)
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody"))
                        )
                        Actor.log.info(f'Refreshed and navigated to page {current_page}.')
                    except Exception:
                        Actor.log.error(f'Error refreshing and navigating to page {current_page}, likely the end.')
                        break

            # Save everything
            await Actor.push_data(all_data)
            Actor.log.info('Your Leads Are Ready!')

        except Exception as e:
            Actor.log.error(f'Error during login or scraping: {str(e)}')
        finally:
            driver.quit()
# Allow running this module directly; the Actor entry point (src/__main__.py)
# launches the same coroutine via asyncio.run().
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
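A note on the pagination above: main() normalizes the list URL to carry an explicit page= parameter, splits it into a base and a page number, and rebuilds the URL with an incremented number to reach each next page. A minimal standalone sketch of that step, using a made-up URL:

import re

# Hypothetical Apollo list URL; it may or may not already carry a page parameter.
list_url = 'https://app.apollo.io/#/people?finderViewId=123'

# Normalize the URL so it always has an explicit page parameter, as main() does.
if 'page=' not in list_url:
    list_url += '&page=1'

# Split into the base (everything up to and including "page=") and the page number.
base_url, current_page = re.match(r"(.*page=)(\d+)", list_url).groups()
current_page = int(current_page)

# The next page is reached by re-appending an incremented number.
next_page_url = f"{base_url}{current_page + 1}"
print(next_page_url)  # https://app.apollo.io/#/people?finderViewId=123&page=2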
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.5.1
selenium ~= 4.14.0