Nordstrom Rack Scrape

jessiedev/my-actor

This Actor is deprecated and is no longer available from its developer.

Scrapes Nordstrom Rack products and finds the best-value ones.

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        },
        "num": {
            "title": "Number of pages",
            "type": "integer",
            "description": "How many pages you want to scrape.",
            "editor": "number",
            "prefill": 1
        },
        "unwanted": {
            "title": "Brands you don't want to see",
            "type": "string",
            "description": "Comma-separated list of brands to exclude from the results.",
            "editor": "textfield",
            "prefill": " "
        }
    },
    "required": ["url", "num", "unwanted"]
}
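
For reference, an input matching this schema can be supplied when calling the Actor from Python. The following is a minimal sketch using the apify-client package; the API token, the listing URL, and the brand names are placeholders, not values from this repository.

from apify_client import ApifyClient

# Placeholder token -- replace with your own Apify API token.
client = ApifyClient("<YOUR_APIFY_TOKEN>")

run_input = {
    "url": "https://www.nordstromrack.com/...",  # placeholder: a Nordstrom Rack listing page
    "num": 2,                                    # number of listing pages to scrape
    "unwanted": "BrandA, BrandB",                # placeholder brands to exclude
}

# Start the Actor and wait for the run to finish.
run = client.actor("jessiedev/my-actor").call(run_input=run_input)

# Print the scraped products from the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)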

src/__init__.py

src/__main__.py

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())

src/main.py

# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
from apify import Actor
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
import requests
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
from bs4 import BeautifulSoup
import time
import re


def convert_to_array_or_string(input_str):
    # Turn the comma-separated "unwanted" input into a list of brand names.
    # Always return a list so membership checks compare whole brand names
    # rather than substrings of a single string.
    if not input_str:
        return []
    return [item.strip() for item in input_str.split(',') if item.strip()]


def extract_price_range(price_range_text):
    # Parse a "$min – $max" price range; return (None, None) if it doesn't match.
    pattern = r'\$(\d+\.?\d*) – \$(\d+\.?\d*)'
    price_range_match = re.search(pattern, price_range_text)

    if price_range_match:
        min_price = float(price_range_match.group(1))
        max_price = float(price_range_match.group(2))
        return min_price, max_price
    return None, None


def scrape_page(url, unwanted_brands):
    # Scrape a single listing page and return a list of product dicts.
    product_data_list = []
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the product containers
            product_containers = soup.find_all('article', {'class': 'ivm_G _PT1R'})

            for container in product_containers:
                product_data = {}

                # Brand name; skip products from unwanted brands
                brand_elem = container.find('div', {'class': 'KtWqU FKmne Io521'})
                current_brand_name = brand_elem.text.strip() if brand_elem else ''
                if current_brand_name in unwanted_brands:
                    continue
                product_data['brand_name'] = current_brand_name

                # Product name
                name_elem = container.find('h3', {'class': 'kKGYj TpwNx'})
                product_data['product_name'] = name_elem.text.strip() if name_elem else None

                # Current price (either a "$min – $max" range or a single price)
                current_price_elem = container.find('span', {'class': 'qHz0a BkySr EhCiu dls-111m7yq'})
                if current_price_elem:
                    price_range_text = current_price_elem.text
                    min_price, max_price = extract_price_range(price_range_text)
                    if min_price is not None and max_price is not None:
                        product_data['min_price'] = min_price
                        product_data['max_price'] = max_price
                    else:
                        # If the price range format doesn't match, handle it as a single price
                        product_data['min_price'] = product_data['max_price'] = float(price_range_text.replace('$', '').replace(',', ''))
                else:
                    product_data['min_price'] = None
                    product_data['max_price'] = None

                # Discount, e.g. "(40% off)" or "(Up to 40% off select items)"
                current_discount_elem = container.find('span', {'class': 'BkySr EhCiu dls-111m7yq'})
                if current_discount_elem:
                    current_discount_text = current_discount_elem.text
                    pattern1 = r'\(Up to (\d+% off) select items\)'
                    pattern2 = r'\((\d+% off)\)'
                    pattern3 = r'\(Up to (\d+% off)\)'

                    match1 = re.search(pattern1, current_discount_text)
                    match2 = re.search(pattern2, current_discount_text)
                    match3 = re.search(pattern3, current_discount_text)

                    percent = None
                    if match1:
                        percent = match1.group(1)
                    elif match2:
                        percent = match2.group(1)
                    elif match3:
                        percent = match3.group(1)

                    product_data['discount'] = percent
                else:
                    product_data['discount'] = None

                # Product link and image
                link_elem = container.find('a', {'class': 'AFBJb'})
                product_data['product_link'] = f"https://www.nordstromrack.com{link_elem['href']}" if link_elem else None

                image_elem = container.find('img', {'name': 'product-module-image'})
                product_data['image_url'] = image_elem['src'] if image_elem else None

                product_data_list.append(product_data)
    except Exception as e:
        print("Error:", e)

    return product_data_list


# Main function for scraping all products
def scrape_all_products(base_url, total_pages, unwanted_brands):
    all_products = []
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}&page={page}"  # Adjust the URL pattern accordingly
        print(f"Scraping page {page} - {page_url}")
        products_on_page = scrape_page(page_url, unwanted_brands)
        all_products.extend(products_on_page)
        # Add a delay between requests (avoid overwhelming the server)
        time.sleep(1)

    return all_products


async def main():
    async with Actor:
        # Structure of input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')
        number = actor_input.get('num')
        unwanted_brands = convert_to_array_or_string(actor_input.get('unwanted'))

        all_p = scrape_all_products(url, number, unwanted_brands)

        await Actor.push_data(all_p)
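
To try the scraping logic locally without the Apify platform, a quick smoke test along these lines should work; the listing URL and the excluded brand are placeholders, not values from this repository.

# Hypothetical local smoke test for src/main.py (placeholders marked below).
from src.main import scrape_all_products

listing_url = "https://www.nordstromrack.com/..."  # placeholder: a listing URL that already has query parameters
products = scrape_all_products(listing_url, 1, ["SomeBrand"])  # "SomeBrand" is a placeholder brand to exclude
print(f"Scraped {len(products)} products")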

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log

requirements.txt

# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0