# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step.
# Keeping it in its own layer lets Docker reuse the cached install when only source files change.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to byte-compile the Actor's Python code, so that syntax errors
# fail the build instead of surfacing at run time.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Getting started with Python and Selenium",
    "description": "Scrapes titles of websites using Selenium.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-selenium"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Python Selenium Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Depth to which to scrape",
            "default": 1
        }
    },
    "required": ["start_urls"]
}

src/__init__.py

src/__main__.py

import asyncio

from .main import main

# Execute the Actor entry point: run the async main() coroutine to completion.
asyncio.run(main())

src/main.py

import logging

from apify import Actor
from selenium import webdriver

# Set up logging for better feedback
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def main():
    """Scrape "data analyst" job postings from a Citi job-search page.

    Launches Chrome through Selenium, opens a hard-coded search URL, inspects
    every <li> element on the page and, for each one that has an <h2> title
    containing "data analyst" (case-insensitive), a "job-location" element and
    a link, pushes a {"title", "location", "link"} record via Actor.push_data.

    List items missing any of those parts are skipped and logged. The browser
    is always shut down, even when navigation or scraping raises.

    NOTE(review): the Actor input is never read here; the URL is hard-coded.
    Confirm whether input such as "start_urls" is meant to be honoured.
    """
    async with Actor:
        # Chrome options
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        # Launch browser
        driver = webdriver.Chrome(options=options)
        logger.info("Browser launched")

        # Everything that uses the driver sits inside try/finally so that the
        # Chrome process is not leaked when a step below raises.
        try:
            # Use your updated Citi URL
            url = "https://jobs.citi.com/search-jobs/Data%20Analyst/India/287/1/2/1269750/22/79/50/2"
            driver.get(url)
            logger.info(f"Opened URL: {url}")

            # Allow up to 15 seconds for the first <li> elements to appear
            driver.implicitly_wait(15)
            logger.info("Waiting for page load")

            # Try a broader selector to find job elements
            jobs = driver.find_elements("tag name", "li")  # Look for <li> tags (common for lists)
            logger.info(f"Found {len(jobs)} elements with 'li' tag")

            # The implicit wait applies to every later lookup as well. Reset it
            # so that each <li> lacking an <h2>, location or link fails
            # immediately instead of blocking for the full 15 seconds.
            driver.implicitly_wait(0)

            # Loop through potential jobs
            for job in jobs:
                try:
                    title_elem = job.find_element("tag name", "h2")  # Try <h2> for titles
                    title = title_elem.text
                    location = job.find_element("class name", "job-location").text  # Guess class
                    link = job.find_element("tag name", "a").get_attribute("href")
                    if "data analyst" in title.lower():  # Filter for relevant jobs
                        await Actor.push_data({
                            "title": title,
                            "location": location,
                            "link": link
                        })
                        logger.info(f"Found: {title} - {location}")
                except Exception as e:
                    # Best effort: most <li> elements are not job cards, so a
                    # failed lookup just means "skip this one".
                    logger.info(f"Skipped a job - error: {str(e)}")

            # Check page source for debugging
            page_source = driver.page_source
            if "data analyst" in page_source.lower():
                logger.info("Page contains 'data analyst' - selectors might be wrong")
            else:
                logger.warning("No 'data analyst' in page source - check URL")
        finally:
            # Close browser
            driver.quit()
            logger.info("Browser closed")

src/py.typed

.dockerignore

.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
selenium

Actors MCP Server

apify/actors-mcp-server

⚠️ Legacy: This Actor is outdated. For the latest features and full documentation, visit https://mcp.apify.com. Easily connect any Apify Actor to AI agents using Anthropic’s Model Context Protocol (MCP) with our actively maintained MCP server.

Apify

1.7K

4.7

Facebook Search Actor

mina_safwat/facebook-search-actor

If you want to search on Facebook at a large scale, this is the best tool available, offering the best accuracy and price.

Mina Safwat

5.0

Perplexity AI Instant Response Actor

scraping_ronin/perplexity-ai-instant-response-actor

The Perplexity AI Actor allows you to interact with Perplexity effortlessly. Simply input your queries and customize your results with system-level instructions and model selection. Enjoy customizable responses with no effort!

Ronin

5.0

Create Mini Actor

valek.josef/create-mini-actor

Josef Válek

Actor Compute Units Aggregator

lukaskrivka/check-compute-per-actor

Aggregates daily or monthly usage of compute units for all your actors. Please don't use this if you have thousands of daily runs as it will overload the Apify API.

Lukáš Křivka

Actor Readme Generator

apify/actor-readme-generator

Generates READMEs for scrapers using ChatGPT, based on an Apify-approved template.

Apify

4.6

DeepL (AI translation) Actor

tkapler/deepl-actor

Receive high-quality translations from/to 24 languages using the DeepL API. It uses a proprietary algorithm with convolutional neural networks (CNNs) and translates far better than, e.g., Google Translate. Free Tier is available.

Tomas Kapler

Actor Costs

lukaskrivka/actor-costs

Get costs and usage stats for your actor use aggregated daily. The actor also provides summary stats for the whole period.

Lukáš Křivka

Actor Inspector Agent

jakub.kopecky/actor-inspector-agent

Agent Actor Inspector 🕵️‍♂️: An Apify Actor that rates others on docs 📝, inputs 🔍, code 💻, functionality ⚙️, performance ⏱️, and uniqueness 🌟. Config with actorId array, run, and review results. Helps devs improve, ensures quality, and guides users.