Allegro Product Scraper
Deprecated
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 4
Monthly users: 1
Last modified: 2 years ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "allegro-product-scraper", "title": "Getting started with Python and Selenium", "description": "Scrapes titles of websites using Selenium.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "product offers", "views": { "titles": { "title": "URLs and their titles", "transformation": { "fields": [ "offer_id", "product_id", "url", "title", "price", "bought", "account", "is_promoted", "is_sponsored" ] }, "display": { "component": "table", "properties": { "offer_id": { "label": "offer_id", "format": "text" }, "product_id": { "label": "product_id", "format": "text" }, "url": { "label": "url", "format": "text" }, "title": { "label": "title", "format": "text" }, "price": { "label": "price", "format": "text" }, "bought": { "label": "bought", "format": "text" }, "account": { "label": "account", "format": "text" }, "is_promoted": { "label": "is_promoted", "format": "text" }, "delivery_time": { "label": "delivery_time", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Python Selenium Scraper", "type": "object", "schemaVersion": 1, "properties": { "product": { "title": "product", "type": "string", "description": "allegro product id", "editor": "textfield" } }, "required": ["product"]}
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
src/main.py
import re
import time

from apify import Actor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By


async def main():
    async with Actor:
        actor_input = await Actor.get_input() or {}
        product = actor_input.get('product', None)
        print('product', product)

        if product is None:
            print('No product.')
            return

        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        # if Actor.config.headless:
        #     chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        time.sleep(1)
        driver.get(f'https://allegro.pl/produkt/{product}?order=p')
        time.sleep(2)

        # Dismiss the cookie-consent dialog ('NIE ZGADZAM SIĘ' means 'I do not agree').
        for button in driver.find_elements(By.TAG_NAME, 'button'):
            if button.text.strip() == 'NIE ZGADZAM SIĘ':
                button.click()
                break
        time.sleep(2)

        print(product, 'page title', driver.title)

        # Collect up to 10 promoted and 10 regular offers; sponsored ones are skipped.
        promoted_count, regular_count = 0, 0
        data, account_invalid_signs = {}, ['%', ' ', 'ź']
        for section in driver.find_elements(By.TAG_NAME, 'section'):
            for article in section.find_elements(By.TAG_NAME, 'article'):
                is_promoted, is_sponsored, bought, price, account, delivery_time = False, False, 0, None, None, None

                # Classify the offer and pull its details out of the listing's spans.
                for span in article.find_elements(By.TAG_NAME, 'span'):
                    span_text = span.text
                    if span_text == 'Promowane':  # 'Promoted'
                        is_promoted = True
                    elif span_text == 'Sponsorowane':  # 'Sponsored'
                        is_sponsored = True
                        break
                    elif span_text == 'zł':
                        # The parent element holds the full price, e.g. '129,99 zł'.
                        parent = span.find_element(By.XPATH, '..')
                        price = parent.text.replace(',', '.').replace('zł', '').strip()
                        price = float(price)
                    elif 'dostawa' in span_text:  # 'delivery'
                        parent = span.find_element(By.XPATH, '..')
                        delivery_time = parent.text.strip()
                    elif re.search(r'\d os(ób|oby) kupił(o|y)', span_text) is not None:
                        # 'N osób kupiło' means 'N people bought'.
                        bought = int(span_text.split(' ')[0])
                    elif len(span_text) > 2 and span_text.isnumeric() is False:
                        # Treat the remaining text as the seller account name,
                        # unless it contains characters that rule it out.
                        valid = True
                        for i in account_invalid_signs:
                            if i in span_text:
                                valid = False
                                break

                        if valid is True:
                            account = span_text

                if is_sponsored is True:
                    continue
                elif is_promoted is True:
                    promoted_count += 1
                    if promoted_count > 10:
                        continue
                else:
                    regular_count += 1
                    if regular_count > 10:
                        continue

                title = article.find_element(By.XPATH, './/h2/a')
                url = title.get_attribute('href')
                title_text = title.text

                # The offer ID is the numeric suffix of the offer URL.
                offer_id = None
                try:
                    if is_sponsored is True:
                        offer_id = int(url.split('%3Fbi_s%3')[0].split('-')[-1].split('?')[0])
                    else:
                        offer_id = int(url.split('-')[-1].split('?')[0])
                except (ValueError, IndexError):
                    print('ERROR', title_text, url)
                    continue

                if offer_id is None or offer_id in data:
                    continue

                data[offer_id] = {
                    'offer_id': offer_id,
                    'product_id': product,
                    'url': url,
                    'title': title_text,
                    'price': price,
                    'bought': bought,
                    'account': account,
                    'is_promoted': is_promoted,
                    'delivery_time': delivery_time
                }

        for _, offer in data.items():
            await Actor.push_data(offer)

        driver.quit()
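For reference, the two string-parsing steps above can be checked in isolation without Selenium; the offer URL and price text below are made up for illustration.

# Hypothetical offer URL in the shape the scraper expects.
url = 'https://allegro.pl/oferta/przykladowy-produkt-12345678901?utm_source=x'
assert int(url.split('-')[-1].split('?')[0]) == 12345678901

# Hypothetical price element text, parsed the same way as in main.py.
price_text = '129,99 zł'
assert float(price_text.replace(',', '.').replace('zł', '').strip()) == 129.99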
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log

# Added by Apify CLI
node_modules
.venv
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
selenium ~= 4.9.1