1from urllib.parse import urljoin
2from urllib.request import urlopen
3from bs4 import BeautifulSoup
4from apify import Actor
5
6
async def main() -> None:
    """Actor entry point: crawl the start URLs, scrape every product tile,
    enrich each product with its detail-page specifications, and push one
    dataset item per product.

    Input (via ``Actor.get_input()``):
        start_urls: list of ``{'url': ...}`` dicts (default: apify.com).
        max_depth:  pages deeper than this are fetched but not parsed
                    (default 1; note nothing here enqueues deeper links,
                    so in practice only depth-0 pages are parsed).
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()
            return  # be explicit: never fall through to queue setup

        # Seed the default request queue with every start URL at depth 0.
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Drain the queue; each request is always marked handled, even on error,
        # so a failing page cannot wedge the crawl.
        while request := await default_queue.fetch_next_request():
            url = request['url']
            Actor.log.info(f'Scraping {url} ...')
            depth = request['userData']['depth']

            try:
                soup = _fetch_soup(url)

                if depth < max_depth:
                    for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
                        # Scope the try to one tile: a single malformed card
                        # must not abort the remaining cards on the page.
                        try:
                            await Actor.push_data(_scrape_card(card, url))
                        except Exception:
                            Actor.log.exception(f'Cannot extract a product card on {url}.')
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                await default_queue.mark_request_as_handled(request)


def _fetch_soup(url: str) -> BeautifulSoup:
    """Download *url* and return its parsed HTML.

    NOTE(review): urlopen blocks the event loop; acceptable for a small
    sequential crawl, but an async HTTP client would be preferable — confirm
    before scaling up.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def _tile_text(card, css_class: str) -> str:
    """Stripped text of the first ``<div>`` with *css_class* inside *card*.

    Raises AttributeError when the element is missing (caller logs and skips).
    """
    return card.find('div', class_=css_class).get_text(strip=True)


def _specification(details, title: str) -> str:
    """Value of the specification row whose title text equals *title*."""
    # bs4 renamed the `text=` keyword to `string=`; behavior is identical.
    row = details.find('div', class_='specification__title', string=title)
    return row.find_next_sibling('div').text.strip()


def _scrape_card(card, page_url: str) -> dict:
    """Extract one product record from a tile plus its detail page.

    Raises on unexpected markup (AttributeError/TypeError/KeyError); the
    caller logs the exception and continues with the next card.
    """
    # Reference cell looks like "Ref: <number>"; take everything after the
    # first colon (empty string, rather than a crash, when absent).
    reference = _tile_text(card, 'ProductTile_reference__cNaET').partition(':')[2]
    condition = (
        card.find('div', class_='ProductTile_conditionContainer__zrUFy')
        .find('div', class_='ProductTile_text___CSJT')
        .get_text(strip=True)
    )
    delivery_time = (
        card.find('div', class_='ProductTile_deliveryTime__sSBNw')
        .find('div')
        .next_sibling.strip()
    )
    detail_url = urljoin(page_url, card.find('a')['href'])

    detail_soup = _fetch_soup(detail_url)
    details = detail_soup.find('div', class_='product-specifications-accordion')

    return {
        'url': detail_url,
        'brand': _tile_text(card, 'ProductTile_brand__lbN8y'),
        'model': _tile_text(card, 'ProductTile_model__o688p'),
        'referenceNo': reference,
        'price': _tile_text(card, 'ProductTile_price__8ctlV'),
        'deliveryTime': delivery_time,
        'condition': condition,
        'dimensions': _specification(details, 'Dimensions'),
        'gender': _specification(details, 'Gender'),
        'case': _specification(details, 'Case'),
        'bracelet': _specification(details, 'Bracelet'),
    }