import asyncio
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from apify import Actor

def init_driver():
    """Create a headless Chrome driver configured for containerized environments."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=chrome_options)

def extract_products(driver, category, max_results):
    """Parse up to max_results product cards from the currently loaded page."""
    products = []
    items = driver.find_elements(By.CSS_SELECTOR, "li.ProductItem")
    print(f"[INFO] Found {len(items)} raw items")

    for item in items:
        if len(products) >= max_results:
            break
        try:
            brand = item.find_element(By.CLASS_NAME, "ProductItem-Brand").text
            title = item.find_element(By.CLASS_NAME, "ProductItem-Title").text
            price = item.find_element(By.CLASS_NAME, "Price").text
            image = item.find_element(By.CSS_SELECTOR, "img.Image-Image").get_attribute("src")
            product_url = item.find_element(By.CSS_SELECTOR, "a.ProductItem-ImgBlock").get_attribute("href")

            products.append({
                "brand": brand,
                "product_name": title,
                "price": price,
                "image": image,
                "product_url": product_url,
                "category": category,
            })
        except Exception as e:
            # A card missing any expected element is skipped rather than failing the run.
            print(f"[WARN] Skipping product due to error: {e}")
    return products

def click_all_load_more(driver, wait, max_results):
    """Click the 'Load More' button until max_results items are present or the button disappears."""
    step = 1
    while True:
        try:
            items = driver.find_elements(By.CSS_SELECTOR, "li.ProductItem")
            print(f"[INFO] Current number of products: {len(items)}")
            if len(items) >= max_results:
                print(f"[INFO] Reached max results limit ({max_results})")
                break
            print(f"[INFO] Clicking 'Load More' button, step {step}")
            load_more_btn = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".LoadMore button"))
            )
            # Click via JavaScript to avoid interception by overlays.
            driver.execute_script("arguments[0].click();", load_more_btn)
            time.sleep(2)  # Give the newly loaded products time to render.
            step += 1
        except Exception:
            print("[INFO] No more 'Load More' button. Finished loading products.")
            break

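# Expected actor input (illustrative shape, inferred from how main() reads it):
#   {
#       "startUrls": {"<category-name>": "<category URL>", ...},
#       "maxResults": 30
#   }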
async def main():
    async with Actor:
        # Actor.get_input() returns None when no input is provided, so fall back to an empty dict.
        input_data = await Actor.get_input() or {}
        start_urls = input_data.get("startUrls", {})
        max_results = input_data.get("maxResults", 30)

        for category, url in start_urls.items():
            print(f"[INFO] Scraping category: {category} - {url}")
            driver = init_driver()
            try:
                driver.get(url)
                wait = WebDriverWait(driver, 10)

                click_all_load_more(driver, wait, max_results)
                products = extract_products(driver, category, max_results)

                print(f"[INFO] Extracted {len(products)} products for category: {category}")
                for product in products:
                    await Actor.push_data(product)
            finally:
                # Always release the browser, even if scraping a category fails.
                driver.quit()

        print("[DONE] Scraping complete.")

if __name__ == "__main__":
    asyncio.run(main())
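
# To run locally (assuming a standard Apify CLI project layout): `apify run`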