IKEA scraper avatar

IKEA scraper

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
IKEA scraper

IKEA scraper

akaya/ikea-scraper

.actor/Dockerfile

1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python:3.11
5FROM apify/actor-python-selenium:3.11
6
7
8# Second, copy just requirements.txt into the Actor image,
9# since it should be the only file that affects the dependency install in the next step,
10# in order to speed up the build
11COPY requirements.txt ./
12
13# Install the packages specified in requirements.txt,
14# Print the installed Python version, pip version
15# and all installed packages with their versions for debugging
16RUN echo "Python version:" \
17 && python --version \
18 && echo "Pip version:" \
19 && pip --version \
20 && echo "Installing dependencies:" \
21 && pip install -r requirements.txt \
22 && echo "All installed Python packages:" \
23 && pip freeze
24
25# Next, copy the remaining files and directories with the source code.
26# Since we do this after installing the dependencies, quick build will be really fast
27# for most source file changes.
28COPY . ./
29
30# Use compileall to ensure the runnability of the Actor Python code.
31RUN python3 -m compileall -q .
32
33# Specify how to launch the source code of your Actor.
34# By default, the "python3 -m src" command is run
35CMD ["python3", "-m", "src"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-1",
4    "title": "Scrape single page in Python",
5    "description": "Scrape data from single page with provided URL.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "python-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "Scrape data from a web page",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "url": {
7            "title": "URL of the page",
8            "type": "string",
9            "description": "The URL of website you want to get the data from.",
10            "editor": "textfield",
11            "prefill": "https://www.apify.com/"
12        }
13    },
14    "required": ["url"]
15}

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())

src/main.py

1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10# Beautiful Soup - library for pulling data out of HTML and XML files, read more at
11# https://www.crummy.com/software/BeautifulSoup/bs4/doc
12from bs4 import BeautifulSoup
13
14# HTTPX - library for making asynchronous HTTP requests in Python, read more at https://www.python-httpx.org/
15from httpx import AsyncClient
16
17# Apify SDK - toolkit for building Apify Actors, read more at https://docs.apify.com/sdk/python
18from apify import Actor
19
20from selenium import webdriver
21from selenium.webdriver.chrome.options import Options as ChromeOptions
22from selenium.webdriver.common.by import By
23import time
24
25import requests
26
27
28# Function to scrape product details from individual product page
29def scrape_product_details(product_url):
30    response = requests.get(product_url)
31    soup = BeautifulSoup(response.content, 'html.parser')
32
33    # Find the dimensions container
34    dimensions_container = soup.find('div', class_='pip-product-dimensions__dimensions-container')
35    
36    measurements = {}
37    if dimensions_container:
38        
39        # Find all the measurements
40        for measurement in dimensions_container.find_all('p', class_='pip-product-dimensions__measurement-wrapper'):
41            name_tag = measurement.find('span', class_='pip-product-dimensions__measurement-name')
42            if name_tag:
43                # Normalize the key to lowercase and remove the colon
44                key = name_tag.text.replace(":", "").strip().lower()
45                
46                # Extract the value and unit
47                value_with_unit = measurement.text.replace(name_tag.text, "").strip()
48                value, unit = value_with_unit.split()  # Split into value and unit
49                
50                # Convert the value to float
51                value = float(value)
52                
53                # Store the measurement in the dictionary
54                measurements[key] = (value, unit)
55    
56    return measurements
57
58async def main() -> None:
59    """
60    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
61    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
62    and it also enhances performance in the field of web scraping significantly.
63    """
64    async with Actor:
65        # Structure of input is defined in input_schema.json
66        actor_input = await Actor.get_input() or {}
67        url = actor_input.get('url')
68
69        # Launch a new Selenium Chrome WebDriver
70        Actor.log.info('Launching Chrome WebDriver...')
71        chrome_options = ChromeOptions()
72        if Actor.config.headless:
73            chrome_options.add_argument('--headless')
74        chrome_options.add_argument('--no-sandbox')
75        chrome_options.add_argument('--disable-dev-shm-usage')
76        driver = webdriver.Chrome(options=chrome_options)
77        driver.get("https://www.ikea.com/nl/en/cat/bookcases-shelving-units-st002/")
78
79        # Allow the page to load completely
80        time.sleep(3)
81
82        reject_cookie = driver.find_element(By.CSS_SELECTOR, "#onetrust-reject-all-handler")
83        reject_cookie.click()
84
85        while True:
86            try:
87                # Find the "Show more" button
88                show_more_button = driver.find_element(By.CSS_SELECTOR, 'a.plp-btn')
89                
90                # Click the "Show more" button
91                show_more_button.click()
92
93                # Wait for new items to load
94                time.sleep(3)
95            except:
96                # Break the loop if no "Show more" button is found (all items loaded)
97                print("No more items to load.")
98                break
99
100        page_source = driver.page_source
101
102        # Create an asynchronous HTTPX client
103        # async with AsyncClient() as client:
104        #     # Fetch the HTML content of the page.
105        #     response = await client.get(url, follow_redirects=True)
106
107        # # Parse the HTML content using Beautiful Soup
108        # soup = BeautifulSoup(response.content, 'html.parser')
109        soup = BeautifulSoup(page_source, 'html.parser')
110
111        product_containers = soup.find_all('div', class_='plp-fragment-wrapper')
112
113        for container in product_containers:
114            product_name = container.find('span', class_='plp-price-module__product-name').text
115            product_price = container.find('div', class_='plp-mastercard')['data-price']
116            product_url = container.find('a', class_='plp-product__image-link')['href']
117            product_photo_url = container.find('img', class_='plp-image plp-product__image')['src']
118
119            # Scrape dimensions from the product page
120            dimensions = scrape_product_details(product_url)
121            
122            # Save headings to Dataset - a table-like storage
123            await Actor.push_data([{"product_name" : product_name, "dimensions" : dimensions,
124                "product_price" : product_price, "product_url" : product_url, "product_photo_url" : product_photo_url}])

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json
13
14.venv/
15.env/
16__pypackages__
17dist/
18build/
19*.egg-info/
20*.egg
21
22__pycache__
23
24.mypy_cache
25.dmypy.json
26dmypy.json
27.pytest_cache
28.ruff_cache
29
30.scrapy
31*.log

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.7.0
5beautifulsoup4 ~= 4.12.2
6httpx ~= 0.25.2
7types-beautifulsoup4 ~= 4.12.0.7
8selenium ~= 4.14.0
9requests==2.31.0
Developer
Maintained by Community