1import re
2import time
3
4from urllib.parse import urljoin
5
6from apify import Actor
7from selenium import webdriver
8from selenium.webdriver.chrome.options import Options as ChromeOptions
9from selenium.webdriver.common.by import By
10
11
async def main():
    """Scrape offers for one Allegro product page and push them to the dataset.

    Reads ``product`` from the Actor input, opens
    ``https://allegro.pl/produkt/{product}?order=p`` in Chrome, clicks the
    "NIE ZGADZAM SIĘ" button when one is present, and walks every
    ``<article>`` nested in a ``<section>``. Sponsored articles are skipped and
    at most 10 promoted plus 10 regular articles are considered. Every kept
    offer is pushed as one dataset item with the keys ``offer_id``,
    ``product_id``, ``url``, ``title``, ``price``, ``bought``, ``account``,
    ``is_promoted`` and ``delivery_time``.

    Returns early, without launching a browser, when the input carries no
    ``product`` value.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        product = actor_input.get('product')
        print('product', product)

        if product is None:
            print('No product.')
            return

        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        # Everything that touches the browser lives in the try block so that
        # the Chrome process is shut down even when scraping raises.
        try:
            time.sleep(1)
            driver.get(f'https://allegro.pl/produkt/{product}?order=p')
            time.sleep(2)

            # Dismiss the consent dialog by clicking its "do not agree" button.
            for button in driver.find_elements(By.TAG_NAME, 'button'):
                if button.text.strip() == 'NIE ZGADZAM SIĘ':
                    button.click()
                    break
            time.sleep(2)

            print(product, 'page title', driver.title)

            max_per_kind = 10  # cap applied separately to promoted and regular offers
            promoted_count, regular_count = 0, 0
            data = {}  # offer_id -> offer record; also de-duplicates offers
            account_invalid_signs = ['%', ' ', 'ź']
            # Compiled once instead of being looked up for every span.
            bought_pattern = re.compile(r'\d os(ób|oby) kupił(o|y)')

            for section in driver.find_elements(By.TAG_NAME, 'section'):
                for article in section.find_elements(By.TAG_NAME, 'article'):
                    is_promoted, is_sponsored = False, False
                    bought, price, account, delivery_time = 0, None, None, None

                    # Offer attributes are recognised purely by span text.
                    for span in article.find_elements(By.TAG_NAME, 'span'):
                        span_text = span.text
                        if span_text == 'Promowane':
                            is_promoted = True
                        elif span_text == 'Sponsorowane':
                            # Sponsored articles are dropped below, so the
                            # remaining spans do not need to be inspected.
                            is_sponsored = True
                            break
                        elif span_text == 'zł':
                            parent = span.find_element(By.XPATH, '..')
                            raw_price = parent.text.replace(',', '.').replace('zł', '')
                            # Remove grouping spaces (plain, NBSP, narrow NBSP)
                            # so an amount like "1 299,00" parses.
                            # NOTE(review): the exact separator used by the
                            # page is an assumption - confirm on live markup.
                            cleaned_price = re.sub('[ \u00a0\u202f]', '', raw_price).strip()
                            try:
                                price = float(cleaned_price)
                            except ValueError:
                                # One unparsable price must not abort the run;
                                # the offer keeps whatever price it had so far.
                                Actor.log.warning('Could not parse price %r', raw_price)
                        elif 'dostawa' in span_text:
                            parent = span.find_element(By.XPATH, '..')
                            delivery_time = parent.text.strip()
                        elif bought_pattern.search(span_text) is not None:
                            bought = int(span_text.split(' ')[0])
                        elif (
                            len(span_text) > 2
                            and not span_text.isnumeric()
                            and not any(sign in span_text for sign in account_invalid_signs)
                        ):
                            # Any other longer, non-numeric text without the
                            # excluded characters is taken as the account name.
                            account = span_text

                    if is_sponsored:
                        continue
                    if is_promoted:
                        promoted_count += 1
                        if promoted_count > max_per_kind:
                            continue
                    else:
                        regular_count += 1
                        if regular_count > max_per_kind:
                            continue

                    title = article.find_element(By.XPATH, './/h2/a')
                    url = title.get_attribute('href')
                    title_text = title.text

                    # The offer id is the number after the last '-' in the URL
                    # path, before any query string.
                    try:
                        offer_id = int(url.split('-')[-1].split('?')[0])
                    except (AttributeError, ValueError):
                        # AttributeError: no href (url is None);
                        # ValueError: trailing URL segment is not an integer.
                        print('ERROR', title_text, url)
                        continue

                    if offer_id in data:
                        continue

                    data[offer_id] = {
                        'offer_id': offer_id,
                        'product_id': product,
                        'url': url,
                        'title': title_text,
                        'price': price,
                        'bought': bought,
                        'account': account,
                        'is_promoted': is_promoted,
                        'delivery_time': delivery_time,
                    }

            for offer in data.values():
                await Actor.push_data(offer)
        finally:
            driver.quit()