Allegro Product Scraper avatar
Allegro Product Scraper

Deprecated

Pricing

Pay per usage

Go to Store
Allegro Product Scraper

Allegro Product Scraper

Deprecated

Developed by

qqqwe

qqqwe

Maintained by Community

0.0 (0)

Pricing

Pay per usage

0

Total users

4

Monthly users

1

Last modified

2 years ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# This image bundles Python 3.11, Chrome, and chromedriver for Selenium actors.
FROM apify/actor-python-selenium:3.11
# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
# (executes src/__main__.py as the entry point).
CMD ["python3", "-m", "src"]

.actor/actor.json

{
"actorSpecification": 1,
"name": "allegro-product-scraper",
"title": "Getting started with Python and Selenium",
"description": "Scrapes titles of websites using Selenium.",
"version": "0.0",
"meta": {
"templateId": "python-selenium"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "product offers",
"views": {
"titles": {
"title": "URLs and their titles",
"transformation": {
"fields": [
"offer_id",
"product_id",
"url",
"title",
"price",
"bought",
"account",
"is_promoted",
"is_sponsored"
]
},
"display": {
"component": "table",
"properties": {
"offer_id": {
"label": "offer_id",
"format": "text"
},
"product_id": {
"label": "product_id",
"format": "text"
},
"url": {
"label": "url",
"format": "text"
},
"title": {
"label": "title",
"format": "text"
},
"price": {
"label": "price",
"format": "text"
},
"bought": {
"label": "bought",
"format": "text"
},
"account": {
"label": "account",
"format": "text"
},
"is_promoted": {
"label": "is_promoted",
"format": "text"
},
"delivery_time": {
"label": "delivery_time",
"format": "text"
}
}
}
}
}
}
}
}

.actor/input_schema.json

{
"title": "Python Selenium Scraper",
"type": "object",
"schemaVersion": 1,
"properties": {
"product": {
"title": "product",
"type": "string",
"description": "allegro product id",
"editor": "textfield"
}
},
"required": ["product"]
}

src/__init__.py

1

src/__main__.py

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Route log records from both the Apify client and the SDK through a
# single stream handler with the Apify-style formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

for logger_name, level in (('apify_client', logging.INFO), ('apify', logging.DEBUG)):
    configured_logger = logging.getLogger(logger_name)
    configured_logger.setLevel(level)
    configured_logger.addHandler(handler)

# Run the actor's async entry point to completion.
asyncio.run(main())

src/main.py

1import re
2import time
3
4from urllib.parse import urljoin
5
6from apify import Actor
7from selenium import webdriver
8from selenium.webdriver.chrome.options import Options as ChromeOptions
9from selenium.webdriver.common.by import By
10
11
async def main():
    """Scrape offer listings for a single Allegro product.

    Reads the ``product`` id from the actor input, opens the Allegro
    product page with Selenium Chrome, dismisses the cookie-consent
    dialog, then collects up to 10 promoted and 10 regular offers
    (sponsored offers are skipped) and pushes them to the default
    dataset. Returns early (without raising) when no product id is given.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        product = actor_input.get('product', None)
        print('product', product)

        if product is None:
            print('No product.')
            return

        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        # if Actor.config.headless:
        #     chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        # Ensure the Chrome process is always released, even when scraping
        # raises — previously driver.quit() was skipped on any exception.
        try:
            time.sleep(1)
            driver.get(f'https://allegro.pl/produkt/{product}?order=p')
            time.sleep(2)
            # Dismiss the cookie-consent dialog ("NIE ZGADZAM SIĘ" = "I do not agree").
            for button in driver.find_elements(By.TAG_NAME, 'button'):
                if button.text.strip() == 'NIE ZGADZAM SIĘ':
                    button.click()
                    break
            time.sleep(2)

            print(product, 'page title', driver.title)

            promoted_count, regular_count = 0, 0
            # A span containing any of these characters is assumed not to be
            # a seller account name.
            data, account_invalid_signs = {}, ['%', ' ', 'ź']
            for section in driver.find_elements(By.TAG_NAME, 'section'):
                for article in section.find_elements(By.TAG_NAME, 'article'):
                    is_promoted, is_sponsored, bought, price, account, delivery_time = False, False, 0, None, None, None
                    for span in article.find_elements(By.TAG_NAME, 'span'):
                        span_text = span.text
                        if span_text == 'Promowane':  # "Promoted"
                            is_promoted = True
                        elif span_text == 'Sponsorowane':  # "Sponsored" — skipped below
                            is_sponsored = True
                            break
                        elif span_text == 'zł':
                            # Price text lives on the parent element, e.g. "12,34 zł".
                            parent = span.find_element(By.XPATH, '..')
                            price = parent.text.replace(',', '.').replace('zł', '').strip()
                            price = float(price)
                        elif 'dostawa' in span_text:  # "delivery"
                            parent = span.find_element(By.XPATH, '..')
                            delivery_time = parent.text.strip()
                        elif re.search(r'\d os(ób|oby) kupił(o|y)', span_text) is not None:
                            # e.g. "5 osób kupiło" -> number of buyers.
                            bought = int(span_text.split(' ')[0])
                        elif len(span_text) > 2 and span_text.isnumeric() is False:
                            # Heuristic: first remaining "clean" span is the seller name.
                            valid = True
                            for sign in account_invalid_signs:
                                if sign in span_text:
                                    valid = False
                                    break

                            if valid is True:
                                account = span_text

                    if is_sponsored is True:
                        continue
                    elif is_promoted is True:
                        promoted_count += 1
                        if promoted_count > 10:
                            continue
                    else:
                        regular_count += 1
                        if regular_count > 10:
                            continue

                    title = article.find_element(By.XPATH, './/h2/a')
                    url = title.get_attribute('href')
                    title_text = title.text

                    # Offer id is the trailing numeric segment of the URL slug.
                    # (Sponsored offers never reach this point — they are
                    # skipped above — so no special URL handling is needed.)
                    try:
                        offer_id = int(url.split('-')[-1].split('?')[0])
                    except (ValueError, IndexError, AttributeError):
                        print('ERROR', title_text, url)
                        continue

                    # Deduplicate offers that appear in multiple sections.
                    if offer_id in data:
                        continue

                    data[offer_id] = {
                        'offer_id': offer_id,
                        'product_id': product,
                        'url': url,
                        'title': title_text,
                        'price': price,
                        'bought': bought,
                        'account': account,
                        'is_promoted': is_promoted,
                        'delivery_time': delivery_time
                    }

            for offer in data.values():
                await Actor.push_data(offer)
        finally:
            driver.quit()

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.scrapy
*.log
# Added by Apify CLI
node_modules
.venv

requirements.txt

# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
selenium ~= 4.9.1