ImmoScout Scraper avatar
ImmoScout Scraper

Deprecated

Pricing

Pay per usage

Go to Store
ImmoScout Scraper

ImmoScout Scraper

Deprecated

Developed by

Botification

Botification

Maintained by Community

0.0 (0)

Pricing

Pay per usage

0

Total users

5

Monthly users

5

Runs succeeded

0%

Last modified

3 months ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# NOTE: the actor-python-selenium image bundles a browser + chromedriver, so the
# Python code below can launch Selenium without any extra installation step.
FROM apify/actor-python-selenium:3.12
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor-8",
"title": "Getting started with Python and Selenium",
"description": "Scrapes titles of websites using Selenium.",
"version": "0.0",
"buildTag": "latest",
"meta": {
"templateId": "python-selenium"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Python Selenium Scraper",
"type": "object",
"schemaVersion": 1,
"properties": {
"start_urls": {
"title": "Start URLs",
"type": "array",
"description": "URLs to start with",
"prefill": [
{ "url": "https://apify.com" }
],
"editor": "requestListSources"
},
"max_depth": {
"title": "Maximum depth",
"type": "integer",
"description": "Depth to which to scrape",
"default": 1
}
},
"required": ["start_urls"]
}

src/__main__.py

import asyncio

from .main import main

# Entry point for `python -m src`: run the Actor's async main() to completion.
# Execute the Actor entry point.
asyncio.run(main())

src/main.py

1"""This module defines the main entry point for the Apify Actor.
2
3Feel free to modify this file to suit your specific needs.
4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
6https://docs.apify.com/sdk/python
7"""
8
9import asyncio
10from urllib.parse import urljoin
11from selenium.webdriver.support.ui import WebDriverWait
12from selenium.webdriver.support import expected_conditions as EC
13import undetected_chromedriver as uc
14from selenium.webdriver.chrome.options import Options as ChromeOptions
15from selenium.webdriver.common.by import By
16
17from apify import Actor, Request
18
19# To run this Actor locally, you need to have the Selenium Chromedriver installed.
20# Follow the installation guide at:
21# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
22# When running on the Apify platform, the Chromedriver is already included
23# in the Actor's Docker image.
24
async def main() -> None:
    """Main entry point for the Apify Actor.

    This coroutine is executed using `asyncio.run()`, so it must remain an
    asynchronous function for proper execution. It reads the Actor input,
    enqueues the start URLs, and scrapes the '/print' view of each queued
    page with an undetected-chromedriver Chrome instance, pushing the page
    title and body HTML to the default dataset.
    """
    # Enter the context of the Actor.
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls')
        # NOTE(review): max_depth is read from the input schema but no link
        # enqueueing is implemented yet, so it is currently unused.
        max_depth = actor_input.get('max_depth', 1)

        # Exit if no start URLs are provided.
        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()
            return

        # Open the default request queue for handling URLs to be processed.
        request_queue = await Actor.open_request_queue()

        # Launch a new Selenium Chrome WebDriver and configure it.
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # Hides Selenium use
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = uc.Chrome(options=chrome_options)

        try:
            # Enqueue the start URLs with an initial crawl depth of 0.
            # (The pages are NOT loaded here — they are fetched from the
            # queue below; loading them during enqueueing was redundant.)
            for start_url in start_urls:
                url = start_url.get('url')
                Actor.log.info(f'Enqueuing {url} ...')
                new_request = Request.from_url(url, user_data={'depth': 0})
                await request_queue.add_request(new_request)

            # Process the URLs from the request queue.
            while request := await request_queue.fetch_next_request():
                url = request.url

                if not isinstance(request.user_data['depth'], (str, int)):
                    raise TypeError('Request.depth is an unexpected type.')

                depth = int(request.user_data['depth'])
                # Scrape the printer-friendly view of the *request's* URL.
                # (Previously this was derived from driver.current_url, which
                # pointed at whatever page was loaded last, not this request.)
                final = url.rstrip('/') + '/print'

                Actor.log.info(f'Scraping {final} (depth={depth}) ...')
                try:
                    # driver.get() is a blocking Selenium call; run it in a
                    # worker thread so the asyncio event loop stays responsive.
                    await asyncio.to_thread(driver.get, final)

                    # # Take a screenshot (optional for debugging)
                    # driver.save_screenshot("screenshot.png")

                    # Extract page content.
                    body_content = driver.find_element(By.XPATH, "//body").get_attribute("outerHTML")

                    data = {
                        'url': final,
                        'title': driver.title,
                        'body': body_content,
                    }

                    # Store the extracted data to the default dataset.
                    await Actor.push_data(data)

                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')

                finally:
                    # Mark the request as handled to ensure it is not processed
                    # again — without this the loop re-fetches the same request
                    # forever.
                    await request_queue.mark_request_as_handled(request)
        finally:
            # Always release the browser, even if the crawl loop raised.
            driver.quit()

.dockerignore

.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
selenium
webdriver-manager < 4.0.2
undetected-chromedriver < 3.5.5