1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10
11
12from bs4 import BeautifulSoup
13
14
15from httpx import AsyncClient
16
17
18from apify import Actor
19
20from selenium import webdriver
21from selenium.webdriver.chrome.options import Options as ChromeOptions
22from selenium.webdriver.common.by import By
23import time
24
25import requests
26
27
28
def scrape_product_details(product_url):
    """Fetch a product page and extract the measurements listed in its dimensions section.

    Args:
        product_url: URL of the product detail page to download.

    Returns:
        A dict mapping each measurement name (lower-cased, colon removed) to a
        ``(value, unit)`` tuple, where ``value`` is a float and ``unit`` is the
        token that follows it. The dict is empty when the page contains no
        dimensions container. Measurements whose text is not of the form
        "<number> <unit>" are logged and skipped instead of aborting the
        whole product.

    Raises:
        requests.RequestException: If the page cannot be downloaded, including
            when the request exceeds the timeout.
    """
    # Without a timeout a single stalled connection would block the caller forever.
    response = requests.get(product_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    measurements = {}
    dimensions_container = soup.find('div', class_='pip-product-dimensions__dimensions-container')
    if dimensions_container is None:
        return measurements

    for measurement in dimensions_container.find_all('p', class_='pip-product-dimensions__measurement-wrapper'):
        name_tag = measurement.find('span', class_='pip-product-dimensions__measurement-name')
        if name_tag is None:
            continue

        key = name_tag.text.replace(":", "").strip().lower()

        # The wrapper's text contains the name as well; removing it leaves the value part.
        value_with_unit = measurement.text.replace(name_tag.text, "").strip()

        # Only "<number> <unit>" can be represented as a (float, unit) tuple.
        parts = value_with_unit.split()
        if len(parts) != 2:
            Actor.log.warning('Skipping measurement %r with unexpected value %r', key, value_with_unit)
            continue

        raw_value, unit = parts
        try:
            value = float(raw_value)
        except ValueError:
            Actor.log.warning('Skipping measurement %r with non-numeric value %r', key, raw_value)
            continue

        measurements[key] = (value, unit)

    return measurements
57
async def main() -> None:
    """
    Scrape the IKEA bookcases and shelving units listing and push one dataset item per product.

    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    Steps:
        1. Open the listing page in Chrome and reject the cookie banner if it is shown.
        2. Click the "show more" button until it can no longer be found or clicked.
        3. Parse the fully expanded listing and, for every product tile, download its
           detail page to collect the dimensions.
        4. Push each product as an item with the keys ``product_name``, ``dimensions``,
           ``product_price``, ``product_url`` and ``product_photo_url``.
    """
    # Imported here so this coroutine brings the extra names it needs into scope itself.
    import asyncio

    from selenium.common.exceptions import WebDriverException

    async with Actor:
        # The Actor input is still read, but none of its fields are used:
        # the listing URL below is fixed.
        await Actor.get_input()

        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        # The browser is only needed to expand the listing; always close it afterwards,
        # even when something fails, so no Chrome process is left behind.
        try:
            driver.get("https://www.ikea.com/nl/en/cat/bookcases-shelving-units-st002/")

            # Give the page time to render; asyncio.sleep keeps the event loop responsive.
            await asyncio.sleep(3)

            # The cookie banner may be absent; that must not abort the run.
            try:
                driver.find_element(By.CSS_SELECTOR, "#onetrust-reject-all-handler").click()
            except WebDriverException:
                Actor.log.info('Cookie banner not found or not clickable, continuing.')

            # Keep loading more products until the button disappears or cannot be clicked.
            while True:
                try:
                    driver.find_element(By.CSS_SELECTOR, 'a.plp-btn').click()
                except WebDriverException:
                    Actor.log.info('No more items to load.')
                    break
                await asyncio.sleep(3)

            page_source = driver.page_source
        finally:
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        loop = asyncio.get_running_loop()

        for container in soup.find_all('div', class_='plp-fragment-wrapper'):
            name_tag = container.find('span', class_='plp-price-module__product-name')
            link_tag = container.find('a', class_='plp-product__image-link')
            product_url = link_tag.get('href') if link_tag is not None else None

            # A tile without a name or a link cannot be turned into a usable record.
            if name_tag is None or not product_url:
                Actor.log.warning('Skipping a product tile without a name or link.')
                continue

            card_tag = container.find('div', class_='plp-mastercard')
            image_tag = container.find('img', class_='plp-image plp-product__image')

            product_name = name_tag.text
            product_price = card_tag.get('data-price') if card_tag is not None else None
            product_photo_url = image_tag.get('src') if image_tag is not None else None

            # The detail scraper does blocking network I/O, so run it in a worker thread.
            # One failing product page must not abort the whole run.
            try:
                dimensions = await loop.run_in_executor(None, scrape_product_details, product_url)
            except Exception:
                Actor.log.exception('Failed to scrape dimensions for %s', product_url)
                dimensions = {}

            await Actor.push_data([{
                "product_name": product_name,
                "dimensions": dimensions,
                "product_price": product_price,
                "product_url": product_url,
                "product_photo_url": product_photo_url,
            }])