Nordstrom Rack Scrape

Deprecated

Developed by Jessie Zhou · Maintained by Community
Scrape Nordstrom Rack products and find the best-value ones.

Rating: 0.0 (0 reviews)
Pricing: Pay per usage
Total users: 3
Monthly users: 3
Last modified: 2 years ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11
# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Scrape single page in Python",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        },
        "num": {
            "title": "How many pages?",
            "type": "integer",
            "description": "How many pages of results you want to scrape.",
            "editor": "number",
            "prefill": 1
        },
        "unwanted": {
            "title": "Brands you don't want to see",
            "type": "string",
            "description": "Comma-separated list of brands you don't want in the results.",
            "editor": "textfield",
            "prefill": " "
        }
    },
    "required": ["url", "num", "unwanted"]
}
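When the actor is run locally with the Apify CLI, input matching this schema normally lives at storage/key_value_stores/default/INPUT.json (the one file under storage/ that the .gitignore below keeps in version control). A minimal illustrative example follows; the URL and brand names are placeholders, not values taken from this repository:

{
    "url": "https://www.nordstromrack.com/shop/...",
    "num": 2,
    "unwanted": "BrandA, BrandB"
}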

src/__init__.py


src/__main__.py

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())

src/main.py

# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
from apify import Actor
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
import requests
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
from bs4 import BeautifulSoup
import time
import re


def convert_to_array_or_string(input_str):
    if ',' in input_str:
        # If the input string contains a comma, split it into a list of brand names
        return [item.strip() for item in input_str.split(',')]
    else:
        # If there's no comma, keep the original string
        return input_str


def extract_price_range(price_range_text):
    # Matches price ranges such as "$19.97 – $39.97" (note the en dash used by the site)
    pattern = r'\$(\d+\.?\d*) – \$(\d+\.?\d*)'
    price_range_match = re.search(pattern, price_range_text)

    if price_range_match:
        min_price = float(price_range_match.group(1))
        max_price = float(price_range_match.group(2))
        return min_price, max_price
    else:
        return None, None


def scrape_page(url, unwanted_brands):
    product_data_list = []
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the product containers (class names match the current Nordstrom Rack markup)
            product_containers = soup.find_all('article', {'class': 'ivm_G _PT1R'})

            for container in product_containers:
                product_data = {}

                # Brand name
                current_brand_name = container.find('div', {'class': 'KtWqU FKmne Io521'}).text

                # Skip brands the user asked to exclude (list membership,
                # or substring match when a single brand string was given)
                if current_brand_name in unwanted_brands:
                    continue
                product_data['brand_name'] = current_brand_name

                # Product name
                product_data['product_name'] = container.find('h3', {'class': 'kKGYj TpwNx'}).text.strip()

                # Current price: either a "$x – $y" range or a single price
                current_price_elem = container.find('span', {'class': 'qHz0a BkySr EhCiu dls-111m7yq'})
                if current_price_elem:
                    price_range_text = current_price_elem.text
                    min_price, max_price = extract_price_range(price_range_text)
                    if min_price is not None and max_price is not None:
                        product_data['min_price'] = min_price
                        product_data['max_price'] = max_price
                    else:
                        # If the price range format doesn't match, handle it as a single price
                        product_data['min_price'] = product_data['max_price'] = float(price_range_text.replace('$', '').replace(',', ''))
                else:
                    product_data['min_price'] = None
                    product_data['max_price'] = None

                # Discount, e.g. "(40% off)", "(Up to 60% off)" or "(Up to 60% off select items)"
                current_discount_elem = container.find('span', {'class': 'BkySr EhCiu dls-111m7yq'})
                if current_discount_elem:
                    current_discount_text = current_discount_elem.text
                    pattern1 = r'\(Up to (\d+% off) select items\)'
                    pattern2 = r'\((\d+% off)\)'
                    pattern3 = r'\(Up to (\d+% off)\)'

                    match1 = re.search(pattern1, current_discount_text)
                    match2 = re.search(pattern2, current_discount_text)
                    match3 = re.search(pattern3, current_discount_text)

                    percent = None
                    if match1:
                        percent = match1.group(1)
                    elif match2:
                        percent = match2.group(1)
                    elif match3:
                        percent = match3.group(1)

                    product_data['discount'] = percent
                else:
                    product_data['discount'] = None

                # Product link and image
                product_link = container.find('a', {'class': 'AFBJb'})['href']
                product_data['product_link'] = f"https://www.nordstromrack.com{product_link}"
                product_data['image_url'] = container.find('img', {'name': 'product-module-image'})['src']

                product_data_list.append(product_data)
    except Exception as e:
        print("Error:", e)

    # Returns an empty list on errors or non-200 responses so callers can extend() safely
    return product_data_list


# Main scraping function: walks through all requested pages and collects the products
def scrape_all_products(base_url, total_pages, unwanted_brands):
    all_products = []
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}&page={page}"  # Adjust the URL pattern accordingly
        print(f"Scraping page {page} - {page_url}")
        products_on_page = scrape_page(page_url, unwanted_brands)
        all_products.extend(products_on_page)
        # Add a delay between requests (avoid overwhelming the server)
        time.sleep(1)

    return all_products


async def main():
    async with Actor:
        # The structure of the input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')
        number = actor_input.get('num')

        unwanted_brands = convert_to_array_or_string(actor_input.get('unwanted', ''))

        all_products = scrape_all_products(url, number, unwanted_brands)

        await Actor.push_data(all_products)
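Once deployed, the actor can also be called from outside the platform with the Apify API client for Python. The snippet below is a minimal sketch only: the token, the actor ID (jessie.zhou/nordstrom-rack-scrape) and the run input are placeholders for illustration, not values confirmed by this repository.

# Minimal sketch: call the deployed actor via apify-client and read its dataset.
# Token, actor ID and input values below are placeholders.
from apify_client import ApifyClient

client = ApifyClient('MY-APIFY-TOKEN')

run = client.actor('jessie.zhou/nordstrom-rack-scrape').call(run_input={
    'url': 'https://www.nordstromrack.com/shop/...',  # listing page to scrape (placeholder)
    'num': 2,                                         # number of result pages to fetch
    'unwanted': 'BrandA, BrandB',                     # comma-separated brands to skip (placeholders)
})

# Each dataset item carries the fields pushed by main(): brand_name, product_name,
# min_price, max_price, discount, product_link and image_url.
for item in client.dataset(run['defaultDatasetId']).iterate_items():
    print(item['brand_name'], item['min_price'], item['product_link'])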

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.scrapy
*.log

requirements.txt

# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0