IKEA scraper
Deprecated
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 2
Last modified: 10 months ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# The Selenium variant is used because this Actor drives a Chrome WebDriver.
FROM apify/actor-python-selenium:3.11
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor-1",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        }
    },
    "required": ["url"]
}
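For reference, an input object matching this schema (using the prefill value) is shown below. When the Actor is run locally, such an input is typically stored as storage/key_value_stores/default/INPUT.json, which the .gitignore below deliberately keeps in source control. Note that src/main.py reads this url field but currently navigates to a hardcoded IKEA category page instead.

{
    "url": "https://www.apify.com/"
}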
src/__main__.py
"""
This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
settings. The `main()` coroutine is then executed using `asyncio.run()`.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Configure loggers
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

# Execute the Actor main coroutine
asyncio.run(main())
src/main.py
"""
This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.

Feel free to modify this file to suit your specific needs.

To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
https://docs.apify.com/sdk/python
"""

# Beautiful Soup - library for pulling data out of HTML and XML files, read more at
# https://www.crummy.com/software/BeautifulSoup/bs4/doc
from bs4 import BeautifulSoup

# Apify SDK - toolkit for building Apify Actors, read more at https://docs.apify.com/sdk/python
from apify import Actor

# Selenium - browser automation, used here to load the category page and click through "Show more"
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
import time

# Requests - synchronous HTTP client, used to fetch the individual product pages
import requests


# Function to scrape product details from an individual product page
def scrape_product_details(product_url):
    response = requests.get(product_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the dimensions container
    dimensions_container = soup.find('div', class_='pip-product-dimensions__dimensions-container')

    measurements = {}
    if dimensions_container:
        # Find all the measurements
        for measurement in dimensions_container.find_all('p', class_='pip-product-dimensions__measurement-wrapper'):
            name_tag = measurement.find('span', class_='pip-product-dimensions__measurement-name')
            if name_tag:
                # Normalize the key to lowercase and remove the colon
                key = name_tag.text.replace(":", "").strip().lower()

                # Extract the value and unit
                value_with_unit = measurement.text.replace(name_tag.text, "").strip()
                value, unit = value_with_unit.split()  # Split into value and unit

                # Convert the value to float
                value = float(value)

                # Store the measurement in the dictionary
                measurements[key] = (value, unit)

    return measurements


async def main() -> None:
    """
    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with the Apify platform,
    and it also significantly enhances performance when scraping the web.
    """
    async with Actor:
        # The structure of the input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        # Note: the input URL is read here, but the category URL below is currently hardcoded
        url = actor_input.get('url')

        # Launch a new Selenium Chrome WebDriver
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)
        driver.get("https://www.ikea.com/nl/en/cat/bookcases-shelving-units-st002/")

        # Allow the page to load completely
        time.sleep(3)

        # Dismiss the cookie consent banner
        reject_cookie = driver.find_element(By.CSS_SELECTOR, "#onetrust-reject-all-handler")
        reject_cookie.click()

        while True:
            try:
                # Find the "Show more" button
                show_more_button = driver.find_element(By.CSS_SELECTOR, 'a.plp-btn')

                # Click the "Show more" button
                show_more_button.click()

                # Wait for new items to load
                time.sleep(3)
            except Exception:
                # Break the loop if no "Show more" button is found (all items loaded)
                Actor.log.info("No more items to load.")
                break

        page_source = driver.page_source

        # Parse the fully loaded page source using Beautiful Soup
        soup = BeautifulSoup(page_source, 'html.parser')

        product_containers = soup.find_all('div', class_='plp-fragment-wrapper')

        for container in product_containers:
            product_name = container.find('span', class_='plp-price-module__product-name').text
            product_price = container.find('div', class_='plp-mastercard')['data-price']
            product_url = container.find('a', class_='plp-product__image-link')['href']
            product_photo_url = container.find('img', class_='plp-image plp-product__image')['src']

            # Scrape dimensions from the product page
            dimensions = scrape_product_details(product_url)

            # Save the product record to the Dataset - a table-like storage
            await Actor.push_data([{
                "product_name": product_name,
                "dimensions": dimensions,
                "product_price": product_price,
                "product_url": product_url,
                "product_photo_url": product_photo_url,
            }])
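The "Show more" loop above relies on fixed `time.sleep()` pauses. As a minimal sketch (not part of the original Actor; it assumes the same `a.plp-btn` selector and uses Selenium's standard explicit-wait API), the loop could instead proceed as soon as the button becomes clickable and stop once it no longer appears:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def click_show_more_until_done(driver, timeout=10):
    # Hypothetical helper: keep clicking "Show more" until the button stops appearing.
    while True:
        try:
            # Wait up to `timeout` seconds for the button to become clickable
            show_more_button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.plp-btn'))
            )
            show_more_button.click()
        except TimeoutException:
            # No clickable "Show more" button within the timeout - assume all items are loaded
            break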
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
selenium ~= 4.14.0
requests == 2.31.0