Nordstrom Rack Scrape
This Actor is unavailable because the developer has decided to deprecate it.
jessiedev/my-actor
Scrape Nordstrom Rack products and find the best-value items.
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, subsequent builds
# will be fast for most source-file changes.
COPY . ./

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
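To try the image outside the Apify platform, you can build and run it directly with Docker. A minimal sketch, run from the repository root (the image tag is arbitrary); note that without Apify storage attached, the Actor will receive empty input:

docker build -f .actor/Dockerfile -t nordstrom-rack-scrape .
docker run --rm nordstrom-rack-scrape

Alternatively, the Apify CLI's `apify run` command executes the Actor locally, reading input from the local storage folder (see the INPUT.json path referenced in .gitignore below).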
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Nordstrom Rack Scrape",
    "description": "Scrape Nordstrom Rack product listings, skipping unwanted brands.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        },
        "num": {
            "title": "Number of pages",
            "type": "integer",
            "description": "How many pages you want to scrape.",
            "editor": "number",
            "prefill": 1
        },
        "unwanted": {
            "title": "Brands to exclude",
            "type": "string",
            "description": "Comma-separated list of brands you don't want in the results.",
            "editor": "textfield",
            "prefill": " "
        }
    },
    "required": ["url", "num", "unwanted"]
}
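For illustration, an input object conforming to this schema might look like the following (the URL and brand names are hypothetical examples):

{
    "url": "https://www.nordstromrack.com/shop/Women/Clothing?origin=topnav",
    "num": 3,
    "unwanted": "Nike, Adidas"
}

Note that `unwanted` is a single comma-separated string; the Actor splits it into a list of brand names (see src/main.py below).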
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
src/main.py
# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
from apify import Actor
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
import requests
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
from bs4 import BeautifulSoup
import re
import time


def parse_unwanted_brands(input_str):
    """Split a comma-separated string into a list of brand names,
    dropping empty entries so a blank input excludes nothing."""
    return [item.strip() for item in input_str.split(',') if item.strip()]


def extract_price_range(price_range_text):
    """Parse a price range like '$19.97 – $24.97' into (min, max) floats."""
    pattern = r'\$(\d+\.?\d*) – \$(\d+\.?\d*)'
    price_range_match = re.search(pattern, price_range_text)

    if price_range_match:
        min_price = float(price_range_match.group(1))
        max_price = float(price_range_match.group(2))
        return min_price, max_price
    return None, None


def scrape_page(url, unwanted_brands):
    """Scrape one listing page and return a list of product dicts.
    Returns an empty list on failure so callers can extend() safely."""
    product_data_list = []
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Request failed with status {response.status_code}: {url}")
            return product_data_list

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the product containers (class names match Nordstrom Rack's markup)
        product_containers = soup.find_all('article', {'class': 'ivm_G _PT1R'})

        for container in product_containers:
            product_data = {}

            # Brand name; skip products from excluded brands
            current_brand_name = container.find('div', {'class': 'KtWqU FKmne Io521'}).text
            if current_brand_name in unwanted_brands:
                continue
            product_data['brand_name'] = current_brand_name

            # Product name
            product_data['product_name'] = container.find('h3', {'class': 'kKGYj TpwNx'}).text.strip()

            # Current price: either a range ("$19.97 – $24.97") or a single price
            current_price_elem = container.find('span', {'class': 'qHz0a BkySr EhCiu dls-111m7yq'})
            if current_price_elem:
                price_range_text = current_price_elem.text
                min_price, max_price = extract_price_range(price_range_text)
                if min_price is not None and max_price is not None:
                    product_data['min_price'] = min_price
                    product_data['max_price'] = max_price
                else:
                    # If the price range format doesn't match, handle it as a single price
                    product_data['min_price'] = product_data['max_price'] = float(
                        price_range_text.replace('$', '').replace(',', ''))
            else:
                product_data['min_price'] = None
                product_data['max_price'] = None

            # Discount, shown in one of several formats, e.g.
            # "(Up to 40% off select items)", "(40% off)", or "(Up to 40% off)"
            current_discount_elem = container.find('span', {'class': 'BkySr EhCiu dls-111m7yq'})
            if current_discount_elem:
                current_discount_text = current_discount_elem.text
                patterns = [
                    r'\(Up to (\d+% off) select items\)',
                    r'\((\d+% off)\)',
                    r'\(Up to (\d+% off)\)',
                ]
                percent = None
                for pattern in patterns:
                    match = re.search(pattern, current_discount_text)
                    if match:
                        percent = match.group(1)
                        break
                product_data['discount'] = percent
            else:
                product_data['discount'] = None

            # Product link and image
            product_link = container.find('a', {'class': 'AFBJb'})['href']
            product_data['product_link'] = f"https://www.nordstromrack.com{product_link}"
            product_data['image_url'] = container.find('img', {'name': 'product-module-image'})['src']

            product_data_list.append(product_data)
    except Exception as e:
        print("Error:", e)
    return product_data_list


# Main function for scraping all products
def scrape_all_products(base_url, total_pages, unwanted_brands):
    all_products = []
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}&page={page}"  # Assumes base_url already contains a query string
        print(f"Scraping page {page} - {page_url}")
        products_on_page = scrape_page(page_url, unwanted_brands)
        all_products.extend(products_on_page)
        # Add a delay between requests (avoid overwhelming the server)
        time.sleep(1)

    return all_products


async def main():
    async with Actor:
        # Structure of input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')
        number = actor_input.get('num')

        unwanted_brands = parse_unwanted_brands(actor_input.get('unwanted') or '')

        all_products = scrape_all_products(url, number, unwanted_brands)

        await Actor.push_data(all_products)
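For a quick local smoke test outside the Apify platform, a sketch along these lines works (run from the repository root with the dependencies installed; the listing URL and brand are hypothetical, and the scrape returns nothing if Nordstrom Rack has since changed the class names above):

from src.main import extract_price_range, scrape_all_products

# Unit-style check of the price-range parser
assert extract_price_range('$19.97 – $24.97') == (19.97, 24.97)

# Fetch a single listing page, excluding one brand
products = scrape_all_products(
    'https://www.nordstromrack.com/shop/Women/Clothing?origin=topnav',  # hypothetical URL
    total_pages=1,
    unwanted_brands=['Nike'],
)
print(f"Scraped {len(products)} products")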
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0