watch-master-scrapper

Deprecated
Developed by Yashvi Khunt
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 6
Monthly users: 3
Last modified: a year ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-1",
    "title": "Getting started with Python and BeautifulSoup",
    "description": "Scrapes titles of websites using BeautifulSoup.",
    "version": "0.0",
    "meta": {
        "templateId": "python-beautifulsoup"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "URLs and their titles",
            "views": {
                "titles": {
                    "title": "URLs and their titles",
                    "transformation": {
                        "fields": [
                            "url",
                            "brand",
                            "model",
                            "referenceNo",
                            "price",
                            "deliveryTime",
                            "condition",
                            "dimensions",
                            "gender",
                            "case",
                            "bracelet"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "URL",
                                "format": "text"
                            },
                            "brand": {
                                "label": "Brand",
                                "format": "text"
                            },
                            "model": {
                                "label": "Model",
                                "format": "text"
                            },
                            "referenceNo": {
                                "label": "Reference No.",
                                "format": "text"
                            },
                            "price": {
                                "label": "Price",
                                "format": "text"
                            },
                            "deliveryTime": {
                                "label": "Delivery Time",
                                "format": "text"
                            },
                            "condition": {
                                "label": "Condition",
                                "format": "text"
                            },
                            "dimensions": {
                                "label": "Dimensions",
                                "format": "text"
                            },
                            "gender": {
                                "label": "Gender",
                                "format": "text"
                            },
                            "case": {
                                "label": "Case",
                                "format": "text"
                            },
                            "bracelet": {
                                "label": "Bracelet",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}
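
For orientation, the record shape this dataset view expects is the one produced by src/main.py further down. A hypothetical example record (all values invented for illustration, not real scraped data) would look like this in Python:

# Hypothetical dataset record matching the view fields above.
# Every value is illustrative only; real records come from Actor.push_data() in src/main.py.
example_record = {
    'url': 'https://watchmaster.com/en/product/example-watch',
    'brand': 'Rolex',
    'model': 'Submariner',
    'referenceNo': '116610LN',
    'price': '9,999 EUR',
    'deliveryTime': '2-4 days',
    'condition': 'Very good',
    'dimensions': '40 mm',
    'gender': 'Men',
    'case': 'Steel',
    'bracelet': 'Steel',
}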

.actor/input_schema.json

{
    "title": "Python BeautifulSoup Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://watchmaster.com/en/shop/rolex" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Depth to which to scrape",
            "default": 1
        }
    },
    "required": ["start_urls"]
}
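
As a sketch of how this schema maps onto the actor's runtime input, the prefilled values above would arrive in src/main.py as roughly the following dict returned by Actor.get_input() (hypothetical example):

# Hypothetical input matching the schema above, in the shape Actor.get_input() returns it.
example_input = {
    'start_urls': [{'url': 'https://watchmaster.com/en/shop/rolex'}],
    'max_depth': 1,
}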

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())

src/main.py

from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup
from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input (start URLs and maximum crawl depth).
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()

        # Enqueue the start URLs in the default request queue.
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Process the queue one request at a time.
        while request := await default_queue.fetch_next_request():
            url = request['url']
            Actor.log.info(f'Scraping {url} ...')
            depth = request['userData']['depth']

            try:
                with urlopen(url) as response:
                    html = response.read()
                soup = BeautifulSoup(html, 'html.parser')

                if depth < max_depth:
                    # Each product tile on the listing page carries the basic watch data.
                    for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
                        model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
                        brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
                        refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
                        price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
                        condition = card.find('div', class_='ProductTile_conditionContainer__zrUFy').find('div', class_='ProductTile_text___CSJT').get_text(strip=True)
                        delivery_time = card.find('div', class_='ProductTile_deliveryTime__sSBNw').find('div').next_sibling.strip()
                        link = card.find('a')['href']

                        # Open the product detail page for the remaining specifications.
                        detail_url = urljoin(url, link)
                        with urlopen(detail_url) as detail_response:
                            detail_html = detail_response.read()
                        detail_soup = BeautifulSoup(detail_html, 'html.parser')

                        product_details = detail_soup.find('div', class_='product-specifications-accordion')

                        # Extract additional information from the specifications accordion.
                        dimensions = product_details.find('div', class_='specification__title', text='Dimensions').find_next_sibling('div').text.strip()
                        gender = product_details.find('div', class_='specification__title', text='Gender').find_next_sibling('div').text.strip()
                        case = product_details.find('div', class_='specification__title', text='Case').find_next_sibling('div').text.strip()
                        bracelet = product_details.find('div', class_='specification__title', text='Bracelet').find_next_sibling('div').text.strip()

                        # Combine the listing and detail page data into a single record.
                        data = {
                            'url': detail_url,
                            'brand': brand,
                            'model': model,
                            'referenceNo': refno,
                            'price': price,
                            'deliveryTime': delivery_time,
                            'condition': condition,
                            'dimensions': dimensions,
                            'gender': gender,
                            'case': case,
                            'bracelet': bracelet,
                        }

                        await Actor.push_data(data)
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                await default_queue.mark_request_as_handled(request)
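
The selectors above are tied to Watchmaster's generated CSS-module class names, which may well have changed since the actor was last modified. A minimal offline sketch of the same tile-parsing logic, run against hand-written HTML that merely mimics those class names (not real Watchmaster markup), could look like this:

# Offline sketch of the tile parsing used in src/main.py.
# The HTML below is invented to mimic the class names the scraper expects;
# it is NOT real Watchmaster markup.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div class="ProductTile_container__FxMRh">
  <div class="ProductTile_brand__lbN8y">Rolex</div>
  <div class="ProductTile_model__o688p">Submariner</div>
  <div class="ProductTile_reference__cNaET">Ref.: 116610LN</div>
  <div class="ProductTile_price__8ctlV">9999 EUR</div>
  <a href="/en/product/example">Details</a>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html.parser')
for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
    brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
    model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
    refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
    price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
    link = card.find('a')['href']
    print(brand, model, refno, price, link)

Running this prints the parsed brand, model, reference number, price and link, which makes it easy to sanity-check the selectors without hitting the live site.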

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache
.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7