6thstreet Selenium Link Scraper
.gitignore
storage
node_modules
.venv
dockerfile
# Use the official Apify Python base image
FROM apify/actor-python-selenium:latest

# Copy everything into the container
COPY . ./

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Run the main Python script
CMD ["python", "scraper.py"]
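Assuming the Apify CLI is installed (it is not part of this repository), the actor can be tried locally and deployed from this directory; a sketch:

apify run    # runs scraper.py locally, using the storage directory for input and output
apify push   # uploads the source and builds this Dockerfile on the Apify platform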
requirements.txt
selenium
apify
apify-client
webdriver-manager
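Note that webdriver-manager is listed here but never imported in scraper.py below; the apify/actor-python-selenium base image already bundles a browser and a matching driver, which is why the plain webdriver.Chrome(options=...) call works there. If the script were run outside that image, the driver could be resolved with webdriver-manager instead; a minimal sketch, not part of the actor:

# Hypothetical alternative to init_driver() for running outside the Apify image:
# webdriver-manager downloads a ChromeDriver matching the installed Chrome.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def init_driver_local():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)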
scraper.py
import asyncio
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from apify import Actor


def init_driver():
    """Start a headless Chrome instance suitable for running inside a container."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=chrome_options)


def extract_products(driver, category, max_results):
    """Collect up to max_results product records from the loaded listing page."""
    products = []
    items = driver.find_elements(By.CSS_SELECTOR, "li.ProductItem")
    print(f"[INFO] Found {len(items)} raw items")

    for item in items:
        if len(products) >= max_results:
            break
        try:
            brand = item.find_element(By.CLASS_NAME, "ProductItem-Brand").text
            title = item.find_element(By.CLASS_NAME, "ProductItem-Title").text
            price = item.find_element(By.CLASS_NAME, "Price").text
            image = item.find_element(By.CSS_SELECTOR, "img.Image-Image").get_attribute("src")
            product_url = item.find_element(By.CSS_SELECTOR, "a.ProductItem-ImgBlock").get_attribute("href")

            products.append({
                "brand": brand,
                "product_name": title,
                "price": price,
                "image": image,
                "product_url": product_url,
                "category": category,
            })
        except Exception as e:
            # A product tile missing an expected element is skipped, not fatal.
            print(f"[WARN] Skipping product due to error: {e}")
    return products


def click_all_load_more(driver, wait, max_results):
    """Keep clicking 'Load More' until max_results items are present or the button disappears."""
    step = 1
    while True:
        try:
            items = driver.find_elements(By.CSS_SELECTOR, "li.ProductItem")
            print(f"[INFO] Current number of products: {len(items)}")
            if len(items) >= max_results:
                print(f"[INFO] Reached max results limit ({max_results})")
                break
            print(f"[INFO] Clicking 'Load More' button, step {step}")
            load_more_btn = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".LoadMore button"))
            )
            # Click via JavaScript to avoid overlays intercepting the native click.
            driver.execute_script("arguments[0].click();", load_more_btn)
            time.sleep(2)
            step += 1
        except Exception:
            print("[INFO] No more 'Load More' button. Finished loading products.")
            break


async def main():
    async with Actor:
        input_data = await Actor.get_input() or {}
        start_urls = input_data.get("startUrls", {})
        max_results = input_data.get("maxResults", 100)  # falls back to the schema default

        for category, url in start_urls.items():
            print(f"[INFO] Scraping category: {category} - {url}")
            driver = init_driver()
            try:
                driver.get(url)
                wait = WebDriverWait(driver, 10)

                click_all_load_more(driver, wait, max_results)
                products = extract_products(driver, category, max_results)

                print(f"[INFO] Extracted {len(products)} products for category: {category}")
                for product in products:
                    await Actor.push_data(product)
            finally:
                # Always release the browser, even if a category fails mid-scrape.
                driver.quit()

        print("[DONE] Scraping complete.")


if __name__ == "__main__":
    asyncio.run(main())
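When run locally with the Apify SDK, Actor.get_input() reads from the default key-value store on disk (the storage directory that .gitignore excludes). A minimal sketch of that file, with hypothetical category names and URLs:

storage/key_value_stores/default/INPUT.json
{
    "startUrls": {
        "women-shoes": "https://en-ae.6thstreet.com/women/shoes",
        "men-watches": "https://en-ae.6thstreet.com/men/watches"
    },
    "maxResults": 50
}

Each scraped product is pushed to the run's default dataset as a flat record with brand, product_name, price, image, product_url, and category fields.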
.actor/actor.json
{ "actorSpecification": 1, "name": "6thstreet-selenium-link-scraper", "version": "1.0", "title": "6thStreet Scraper", "input": "input_schema.json"}
.actor/input_schema.json
{ "title": "Scraper input", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs by Category", "type": "object", "editor": "json", "description": "A dictionary of categories with their corresponding URLs to scrape."
}, "maxResults": { "title": "Max Results", "type": "integer", "description": "The maximum number of products to extract per category.", "default": 100, "minimum": 1 } }, "required": ["startUrls"]}