Allegro Product Scraper
Deprecated
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 4
Monthly users: 1
Last modified: 2 years ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "allegro-product-scraper", "title": "Getting started with Python and Selenium", "description": "Scrapes titles of websites using Selenium.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "product offers", "views": { "titles": { "title": "URLs and their titles", "transformation": { "fields": [ "offer_id", "product_id", "url", "title", "price", "bought", "account", "is_promoted", "is_sponsored" ] }, "display": { "component": "table", "properties": { "offer_id": { "label": "offer_id", "format": "text" }, "product_id": { "label": "product_id", "format": "text" }, "url": { "label": "url", "format": "text" }, "title": { "label": "title", "format": "text" }, "price": { "label": "price", "format": "text" }, "bought": { "label": "bought", "format": "text" }, "account": { "label": "account", "format": "text" }, "is_promoted": { "label": "is_promoted", "format": "text" }, "delivery_time": { "label": "delivery_time", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Python Selenium Scraper", "type": "object", "schemaVersion": 1, "properties": { "product": { "title": "product", "type": "string", "description": "allegro product id", "editor": "textfield" } }, "required": ["product"]}
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
src/main.py
import re
import time

from apify import Actor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By


async def main():
    async with Actor:
        actor_input = await Actor.get_input() or {}
        product = actor_input.get('product', None)
        print('product', product)

        if product is None:
            print('No product.')
            return

        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        # if Actor.config.headless:
        #     chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        time.sleep(1)
        driver.get(f'https://allegro.pl/produkt/{product}?order=p')
        time.sleep(2)

        # Dismiss the cookie-consent dialog ('NIE ZGADZAM SIĘ' means 'I do not agree').
        for button in driver.find_elements(By.TAG_NAME, 'button'):
            if button.text.strip() == 'NIE ZGADZAM SIĘ':
                button.click()
                break
        time.sleep(2)

        print(product, 'page title', driver.title)

        # Collect up to 10 promoted and 10 regular offers; sponsored ones are skipped.
        promoted_count, regular_count = 0, 0
        data, account_invalid_signs = {}, ['%', ' ', 'ź']
        for section in driver.find_elements(By.TAG_NAME, 'section'):
            for article in section.find_elements(By.TAG_NAME, 'article'):
                is_promoted, is_sponsored, bought, price, account, delivery_time = False, False, 0, None, None, None

                # Classify the offer and pull its details out of the listing's spans.
                for span in article.find_elements(By.TAG_NAME, 'span'):
                    span_text = span.text
                    if span_text == 'Promowane':  # 'Promoted'
                        is_promoted = True
                    elif span_text == 'Sponsorowane':  # 'Sponsored'
                        is_sponsored = True
                        break
                    elif span_text == 'zł':
                        # The parent element holds the full price, e.g. '129,99 zł'.
                        parent = span.find_element(By.XPATH, '..')
                        price = parent.text.replace(',', '.').replace('zł', '').strip()
                        price = float(price)
                    elif 'dostawa' in span_text:  # 'delivery'
                        parent = span.find_element(By.XPATH, '..')
                        delivery_time = parent.text.strip()
                    elif re.search(r'\d os(ób|oby) kupił(o|y)', span_text) is not None:
                        # 'N osób kupiło' means 'N people bought'.
                        bought = int(span_text.split(' ')[0])
                    elif len(span_text) > 2 and span_text.isnumeric() is False:
                        # Treat the remaining text as the seller account name,
                        # unless it contains characters that rule it out.
                        valid = True
                        for i in account_invalid_signs:
                            if i in span_text:
                                valid = False
                                break

                        if valid is True:
                            account = span_text

                if is_sponsored is True:
                    continue
                elif is_promoted is True:
                    promoted_count += 1
                    if promoted_count > 10:
                        continue
                else:
                    regular_count += 1
                    if regular_count > 10:
                        continue

                title = article.find_element(By.XPATH, './/h2/a')
                url = title.get_attribute('href')
                title_text = title.text

                # The offer ID is the numeric suffix of the offer URL.
                offer_id = None
                try:
                    if is_sponsored is True:
                        offer_id = int(url.split('%3Fbi_s%3')[0].split('-')[-1].split('?')[0])
                    else:
                        offer_id = int(url.split('-')[-1].split('?')[0])
                except (ValueError, IndexError):
                    print('ERROR', title_text, url)
                    continue

                if offer_id is None or offer_id in data:
                    continue

                data[offer_id] = {
                    'offer_id': offer_id,
                    'product_id': product,
                    'url': url,
                    'title': title_text,
                    'price': price,
                    'bought': bought,
                    'account': account,
                    'is_promoted': is_promoted,
                    'delivery_time': delivery_time
                }

        for _, offer in data.items():
            await Actor.push_data(offer)

        driver.quit()
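For reference, the two string-parsing steps above can be checked in isolation without Selenium; the offer URL and price text below are made up for illustration.

# Hypothetical offer URL in the shape the scraper expects.
url = 'https://allegro.pl/oferta/przykladowy-produkt-12345678901?utm_source=x'
assert int(url.split('-')[-1].split('?')[0]) == 12345678901

# Hypothetical price element text, parsed the same way as in main.py.
price_text = '129,99 zł'
assert float(price_text.replace(',', '.').replace('zł', '').strip()) == 129.99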
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log

# Added by Apify CLI
node_modules
.venv
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
selenium ~= 4.9.1