# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.12

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
# --no-cache-dir stops pip from storing its download cache in this layer,
# which would only increase the image size and is never reused at runtime.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
# A syntax error in any copied .py file fails the build here instead of at run time.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "pycon-pokus",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.python.org/events/"
        }
    },
    "required": ["url"]
}

src/__main__.py

"""Package entry point: imports the Actor's `main` coroutine and runs it to completion."""

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())

src/main.py

"""This module defines the main entry point for the Apify Actor.

Feel free to modify this file to suit your specific needs.

To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
https://docs.apify.com/sdk/python
"""

from urllib.parse import urljoin

# Beautiful Soup - A library for pulling data out of HTML and XML files. Read more at:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc
from bs4 import BeautifulSoup

# HTTPX - A library for making asynchronous HTTP requests in Python. Read more at:
# https://www.python-httpx.org/
from httpx import AsyncClient

# Apify SDK - A toolkit for building Apify Actors. Read more at:
# https://docs.apify.com/sdk/python
from apify import Actor


def _extract_event_data(html, base_url='https://www.python.org'):
    """Extract event details from the HTML of an events listing page.

    Args:
        html: The page markup (text or bytes) to parse.
        base_url: URL against which relative event links are resolved.

    Returns:
        A list of dicts with the keys 'title', 'url', 'date' and 'location'.
        Any value that cannot be found in the markup is reported as 'N/A'.
    """
    # Parses the HTML using BeautifulSoup.
    soup = BeautifulSoup(html, 'html.parser')
    events = []

    # Finds all <li> elements inside .list-recent-events.menu
    for event in soup.select('.list-recent-events.menu li'):
        # Extract the event title <a> element.
        title_tag = event.select_one('.event-title a')
        # Extract the event date inside a <time> tag.
        date_tag = event.select_one('time')
        # Extract the event location.
        location_tag = event.select_one('.event-location')

        # Extracts text values and ensures they have default values ('N/A' if missing).
        title = title_tag.get_text(strip=True) if title_tag else 'N/A'
        date = date_tag.get_text(separator=' ', strip=True) if date_tag else 'N/A'
        location = location_tag.get_text(strip=True) if location_tag else 'N/A'

        # Only build a full URL when the link actually has an href. Previously the 'N/A'
        # placeholder was appended to the base URL, producing e.g. 'https://www.python.orgN/A'.
        href = title_tag.get('href') if title_tag else None
        # urljoin resolves relative hrefs against base_url and leaves absolute hrefs intact.
        full_url = urljoin(base_url, href) if href else 'N/A'

        # Adds the extracted data into the events list.
        events.append({
            'title': title,
            'url': full_url,
            'date': date,
            'location': location
        })

    return events


async def main() -> None:
    """Main entry point for the Apify Actor.

    This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution.
    Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
    the field of web scraping significantly.
    """
    # start the Actor, ensures proper initialization and cleanup when the script runs inside Apify
    async with Actor:
        # Retrieve the input object for the Actor. The structure of input is defined in input_schema.json.
        actor_input = await Actor.get_input() or {'url': 'https://www.python.org/events/'}
        url = actor_input.get('url')

        # Create an asynchronous HTTPX client for making HTTP requests.
        async with AsyncClient() as client:
            # Fetch the HTML content of the page, following redirects if necessary.
            Actor.log.info(f'Sending a request to {url}')
            response = await client.get(url, follow_redirects=True)

        # Parse the page's HTML content; relative links are resolved against the final
        # (post-redirect) URL of the response.
        events = _extract_event_data(response.content, base_url=str(response.url))

        # Saves the extracted event data to Apify’s dataset storage (like a database for structured data).
        await Actor.push_data(events)

.dockerignore

.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
beautifulsoup4[lxml]
httpx
types-beautifulsoup4

PyCon Africa demo Actor

katerinahronik/pycon-africa-demo-actor

Kateřina Hroníková

Scraper

code_crafter/scraper

Code Pioneer

9.3K

2.3

PyCon Namibia Event Scraper

ashersam01/demo-pycon-namibia

Scrapes PyCon Namibia event details, including talk titles, speakers, schedules, and venues. Outputs clean, structured JSON data for easy use.

Asher Samwaka

Reviews

hollywood-reporter/reviews

hollywood reporter

Email

contact2353/my-actor-2

TML

DE Scraper

codescraper/de-scaper

CodeScraper

5.0

Facebook Events Scraper

easyapi/facebook-events-scraper

Extract Facebook events data including title, date, cover image, ticket info and more. Perfect for event aggregators, market research, and monitoring competitor events. Supports both public and private events with authentication.

EasyApi

3.0

Serp Events Scraper

payai/serp-events-scraper

🚀 Scrape professional networking events, conferences & business meetups from Google Events via SerpAPI. Get 400+ real events across 10 major US cities in minutes!

PayAI

Actor 1

code_crafter/actor-1

Code Pioneer

Linkedin Jobs Scraper

saadmohsin/linkedin-jobs-scraper

Saad Mohsin

5.0

# First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. FROM apify/actor-python:3.12 # Second, copy just requirements.txt into the Actor image, # since it should be the only file that affects the dependency install in the next step, # in order to speed up the build COPY requirements.txt ./ # Install the packages specified in requirements.txt, # Print the installed Python version, pip version # and all installed packages with their versions for debugging RUN echo "Python version:" \ && python --version \ && echo "Pip version:" \ && pip --version \ && echo "Installing dependencies:" \ && pip install -r requirements.txt \ && echo "All installed Python packages:" \ && pip freeze # Next, copy the remaining files and directories with the source code. # Since we do this after installing the dependencies, quick build will be really fast # for most source file changes. COPY . ./ # Use compileall to ensure the runnability of the Actor Python code. RUN python3 -m compileall -q . # Specify how to launch the source code of your Actor. # By default, the "python3 -m src" command is run CMD ["python3", "-m", "src"]

{ "actorSpecification": 1, "name": "pycon-pokus", "title": "Scrape single page in Python", "description": "Scrape data from single page with provided URL.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile" }

{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of website you want to get the data from.", "editor": "textfield", "prefill": "https://www.python.org/events/" } }, "required": ["url"] }

1"""This module defines the main entry point for the Apify Actor. 2 3Feel free to modify this file to suit your specific needs. 4 5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation: 6https://docs.apify.com/sdk/python 7""" 8 9# Beautiful Soup - A library for pulling data out of HTML and XML files. Read more at: 10# https://www.crummy.com/software/BeautifulSoup/bs4/doc 11from bs4 import BeautifulSoup 12 13# HTTPX - A library for making asynchronous HTTP requests in Python. Read more at: 14# https://www.python-httpx.org/ 15from httpx import AsyncClient 16 17# Apify SDK - A toolkit for building Apify Actors. Read more at: 18# https://docs.apify.com/sdk/python 19from apify import Actor 20 21 22async def main() -> None: 23 """Main entry point for the Apify Actor. 24 25 This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution. 26 Asynchronous execution is required for communication with Apify platform, and it also enhances performance in 27 the field of web scraping significantly. 28 """ 29 # start the Actor, ensures proper initialization and cleanup when the script runs inside Apify 30 async with Actor: 31 # Retrieve the input object for the Actor. The structure of input is defined in input_schema.json. 32 actor_input = await Actor.get_input() or {'url': 'https://www.python.org/events/'} 33 url = actor_input.get('url') 34 35 # Create an asynchronous HTTPX client for making HTTP requests. 36 async with AsyncClient() as client: 37 # Fetch the HTML content of the page, following redirects if necessary. 38 Actor.log.info(f'Sending a request to {url}') 39 response = await client.get(url, follow_redirects=True) 40 41 # Defines a function to extract event details from the HTML response. 42 def extract_event_data(html): 43 # Parses the HTML using BeautifulSoup. 
44 soup = BeautifulSoup(html, 'html.parser') 45 # Initializes an empty events list and sets a baseUrl for constructing full URLs. 46 events = [] 47 baseUrl = 'https://www.python.org' 48 49 # Finds all <li> elements inside .list-recent-events.menu 50 for event in soup.select('.list-recent-events.menu li'): 51 # Extract the event title <a> element. 52 title_tag = event.select_one('.event-title a') 53 # Extract the event date inside a <time> tag. 54 date_tag = event.select_one('time') 55 # Extract the event location. 56 location_tag = event.select_one('.event-location') 57 58 # Extracts text values and ensures they have default values ('N/A' if missing). 59 title = title_tag.get_text(strip=True) if title_tag else 'N/A' 60 url = title_tag['href'] if title_tag and 'href' in title_tag.attrs else 'N/A' 61 date = date_tag.get_text(separator=' ', strip=True) if date_tag else 'N/A' 62 location = location_tag.get_text(strip=True) if location_tag else 'N/A' 63 # Constructs the full event URL by appending the relative href to baseUrl. 64 fullUrl = f"{baseUrl}{url}" if url else 'N/A' 65 66 # Adds the extracted data into the events list. 67 events.append({ 68 'title': title, 69 'url': fullUrl, 70 'date': date, 71 'location': location 72 }) 73 74 return events 75 76 # Calls the extract_event_data() function with the page’s HTML content. 77 events = extract_event_data(response.content) 78 79 # Saves the extracted event data to Apify’s dataset storage (like a database for structured data). 80 await Actor.push_data(events)

.git .mise.toml .nvim.lua storage # The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: .python-version # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/

.mise.toml .nvim.lua storage # The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: .python-version # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/

1# Feel free to add your Python dependencies below. For formatting guidelines, see: 2# https://pip.pypa.io/en/latest/reference/requirements-file-format/ 3 4apify < 3.0 5beautifulsoup4[lxml] 6httpx 7types-beautifulsoup4

Pycon events demo scraper

Pycon events demo scraper

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

src/__main__.py

src/main.py

.dockerignore

.gitignore

requirements.txt

You might also like

PyCon Africa demo Actor

Scraper

PyCon Namibia Event Scraper

Reviews

Email

DE Scraper

Facebook Events Scraper

Serp Events Scraper

Actor 1

Linkedin Jobs Scraper

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

src/__main__.py

src/main.py

.dockerignore

.gitignore

requirements.txt

src/main.py

src/main.py