# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# The Selenium variant is used because the Actor code drives Chrome through Selenium.
# Only a single FROM is needed: an extra leading FROM would merely open an unused
# build stage, since the final image is always built from the last FROM.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-1",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        }
    },
    "required": ["url"]
}

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
# Configure loggers: one stream handler, formatted for Actor logs, shared by both loggers.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_logger = logging.getLogger('apify')

# The API client logs at INFO, the SDK itself at DEBUG.
for _logger, _level in (
    (apify_client_logger, logging.INFO),
    (apify_logger, logging.DEBUG),
):
    _logger.setLevel(_level)
    _logger.addHandler(handler)

# Execute the Actor main coroutine
asyncio.run(main())

src/main.py

1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10# Beautiful Soup - library for pulling data out of HTML and XML files, read more at
11# https://www.crummy.com/software/BeautifulSoup/bs4/doc
12from bs4 import BeautifulSoup
13
14# HTTPX - library for making asynchronous HTTP requests in Python, read more at https://www.python-httpx.org/
15from httpx import AsyncClient
16
17# Apify SDK - toolkit for building Apify Actors, read more at https://docs.apify.com/sdk/python
18from apify import Actor
19
20from selenium import webdriver
21from selenium.webdriver.chrome.options import Options as ChromeOptions
22from selenium.webdriver.common.by import By
23import time
24
25import requests
26
27
def scrape_product_details(product_url):
    """Fetch an individual product page and extract its listed dimensions.

    Args:
        product_url: URL of the product page to download.

    Returns:
        A dict mapping each measurement name (lower-cased, colon removed) to a
        ``(value, unit)`` tuple, where ``value`` is a float and ``unit`` is the
        remaining text after the number. The dict is empty when the page has no
        dimensions container. Measurements whose text cannot be parsed as
        "<number> <unit>" are skipped instead of aborting the whole product.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # A timeout prevents one unresponsive product page from hanging the whole run.
    response = requests.get(product_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    measurements = {}

    # Find the dimensions container
    dimensions_container = soup.find('div', class_='pip-product-dimensions__dimensions-container')
    if dimensions_container is None:
        return measurements

    # Find all the measurements
    for measurement in dimensions_container.find_all('p', class_='pip-product-dimensions__measurement-wrapper'):
        name_tag = measurement.find('span', class_='pip-product-dimensions__measurement-name')
        if name_tag is None:
            continue

        # Normalize the key to lowercase and remove the colon
        key = name_tag.text.replace(":", "").strip().lower()

        # What is left after removing the name is expected to be "<value> <unit>"
        value_with_unit = measurement.text.replace(name_tag.text, "").strip()
        parts = value_with_unit.split()
        if not parts:
            # Nothing but the name was present; there is no value to record.
            continue

        try:
            value = float(parts[0])
        except ValueError:
            # Non-numeric value (unexpected format): skip this measurement only.
            continue

        # Everything after the number is treated as the unit (normally one token).
        unit = " ".join(parts[1:])

        # Store the measurement in the dictionary
        measurements[key] = (value, unit)

    return measurements
57
async def main() -> None:
    """
    Scrape the IKEA bookcases category page and push one dataset item per product.

    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    Each pushed item contains the keys `product_name`, `dimensions`, `product_price`,
    `product_url` and `product_photo_url`.
    """
    # Imported locally so this function does not rely on changes to the module's import section.
    from selenium.common.exceptions import WebDriverException

    async with Actor:
        # Structure of input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        # NOTE(review): the input URL is read but not used; the category page below is
        # hard-coded. Confirm whether the input URL should drive the scrape instead.
        url = actor_input.get('url')

        # Launch a new Selenium Chrome WebDriver
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        # Always quit the driver, even on failure, so the Chrome process is not leaked.
        try:
            driver.get("https://www.ikea.com/nl/en/cat/bookcases-shelving-units-st002/")

            # Allow the page to load completely
            time.sleep(3)

            # Rejecting cookies is best-effort: the banner may be absent or not clickable.
            try:
                driver.find_element(By.CSS_SELECTOR, "#onetrust-reject-all-handler").click()
            except WebDriverException:
                Actor.log.info('Cookie banner not found or not clickable, continuing.')

            # Keep clicking "Show more" until the button can no longer be found or clicked.
            while True:
                try:
                    driver.find_element(By.CSS_SELECTOR, 'a.plp-btn').click()

                    # Wait for new items to load
                    time.sleep(3)
                except WebDriverException:
                    # Only Selenium errors end the loop; unrelated exceptions still propagate.
                    Actor.log.info("No more items to load.")
                    break

            page_source = driver.page_source
        finally:
            driver.quit()

        # Parse the fully expanded listing using Beautiful Soup
        soup = BeautifulSoup(page_source, 'html.parser')

        product_containers = soup.find_all('div', class_='plp-fragment-wrapper')

        for container in product_containers:
            try:
                product_name = container.find('span', class_='plp-price-module__product-name').text
                product_price = container.find('div', class_='plp-mastercard')['data-price']
                product_url = container.find('a', class_='plp-product__image-link')['href']
                product_photo_url = container.find('img', class_='plp-image plp-product__image')['src']
            except (AttributeError, TypeError, KeyError):
                # A tag or attribute is missing in this tile; skip it rather than abort the run.
                Actor.log.warning('Skipping a product tile with missing fields.')
                continue

            # Scrape dimensions from the product page; a failure there should not lose the product.
            try:
                dimensions = scrape_product_details(product_url)
            except (requests.RequestException, ValueError) as exc:
                Actor.log.warning(f'Could not scrape dimensions for {product_url}: {exc}')
                dimensions = {}

            # Save the product to Dataset - a table-like storage
            await Actor.push_data([{
                "product_name": product_name,
                "dimensions": dimensions,
                "product_price": product_price,
                "product_url": product_url,
                "product_photo_url": product_photo_url,
            }])

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.7.0
5beautifulsoup4 ~= 4.12.2
6httpx ~= 0.25.2
7types-beautifulsoup4 ~= 4.12.0.7
8selenium ~= 4.14.0
9requests==2.31.0

KR IKEA Scraper

styleindexamerica/kr-ikea-scraper

This actor is intended to extract data from ikea.com/kr/ko/

PopinBorder Castnet

IKEA Product Extractor - PDP

autoscraping/ikea-product-search-by-url

Instantly extract IKEA product data by URL: specs, images, prices, documents & more in JSON. No login, no proxies. Perfect for research & analysis. Pricing: $2.5/1000 results.

AUTOScraping

IKEA Products Bycategory

pintostudio/ikea-products-bycategory

This actor extracts product information from IKEA's online catalog by category.

Pinto Studio

Snapchat Ads Scraper

lexis-solutions/snapchat-ads-scraper

Extract data from ads shown on the Snapchat platform. Download and analyze data from the Snap Ads Gallery for competitive analysis.

Lexis Solutions

3.6

Catawiki Search Scraper 📈✨ - Faster & Cheaper

scrapestorm/catawiki-search-scraper---faster-cheaper

Catawiki Search Scraper 🔍 allows you to extract detailed auction data with customizable filters. Gather insights on auction titles, descriptions, sellers, bidding prices, images, and more for research or marketing! 🌟📊📝 Perfect for analyzing auction trends and popular listings on Catawiki.

Storm_Scraper

5.0

Ikea Similar Product

pintostudio/ikea-similar-product

The IKEA Similar Product Actor is an Apify web scraper that finds similar products to given IKEA product URLs.

Pinto Studio

Ikea Product Description

pintostudio/ikea-product-description

The IKEA Product Description Actor is a web scraping tool that extracts detailed product information from IKEA product pages.

Pinto Studio

Instagram Story Downloader

scraper-mind/instagram-story-downloader

Download Instagram stories from public profiles in seconds! 📥 Our Instagram Story Downloader supports images, videos, JSON output & smart proxy handling. Fast, reliable, and perfect for marketers, creators, and analysts. No coding needed!