Nordstrom Rack Scrape
This Actor is unavailable because the developer has decided to deprecate it.
jessiedev/my-actor
Scrape Nordstrom Rack products and find the best-value items.
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, subsequent builds
# will be fast for most source-file changes.
COPY . ./

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
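To try the image outside the Apify platform, you can build and run it directly with Docker. A minimal sketch, run from the repository root (the image tag is arbitrary); note that without Apify storage attached, the Actor will receive empty input:

docker build -f .actor/Dockerfile -t nordstrom-rack-scrape .
docker run --rm nordstrom-rack-scrape

Alternatively, the Apify CLI's `apify run` command executes the Actor locally, reading input from the local storage folder (see the INPUT.json path referenced in .gitignore below).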
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Nordstrom Rack Scrape",
    "description": "Scrape Nordstrom Rack product listings, skipping unwanted brands.",
    "version": "0.0",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of the website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com/"
        },
        "num": {
            "title": "Number of pages",
            "type": "integer",
            "description": "How many pages you want to scrape.",
            "editor": "number",
            "prefill": 1
        },
        "unwanted": {
            "title": "Brands to exclude",
            "type": "string",
            "description": "Comma-separated list of brands you don't want in the results.",
            "editor": "textfield",
            "prefill": " "
        }
    },
    "required": ["url", "num", "unwanted"]
}
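For illustration, an input object conforming to this schema might look like the following (the URL and brand names are hypothetical examples):

{
    "url": "https://www.nordstromrack.com/shop/Women/Clothing?origin=topnav",
    "num": 3,
    "unwanted": "Nike, Adidas"
}

Note that `unwanted` is a single comma-separated string; the Actor splits it into a list of brand names (see src/main.py below).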
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
src/main.py
# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
from apify import Actor
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
import requests
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
from bs4 import BeautifulSoup
import re
import time


def parse_unwanted_brands(input_str):
    """Split a comma-separated string into a list of brand names,
    dropping empty entries so a blank input excludes nothing."""
    return [item.strip() for item in input_str.split(',') if item.strip()]


def extract_price_range(price_range_text):
    """Parse a price range like '$19.97 – $24.97' into (min, max) floats."""
    pattern = r'\$(\d+\.?\d*) – \$(\d+\.?\d*)'
    price_range_match = re.search(pattern, price_range_text)

    if price_range_match:
        min_price = float(price_range_match.group(1))
        max_price = float(price_range_match.group(2))
        return min_price, max_price
    return None, None


def scrape_page(url, unwanted_brands):
    """Scrape one listing page and return a list of product dicts.
    Returns an empty list on failure so callers can extend() safely."""
    product_data_list = []
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Request failed with status {response.status_code}: {url}")
            return product_data_list

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the product containers (class names match Nordstrom Rack's markup)
        product_containers = soup.find_all('article', {'class': 'ivm_G _PT1R'})

        for container in product_containers:
            product_data = {}

            # Brand name; skip products from excluded brands
            current_brand_name = container.find('div', {'class': 'KtWqU FKmne Io521'}).text
            if current_brand_name in unwanted_brands:
                continue
            product_data['brand_name'] = current_brand_name

            # Product name
            product_data['product_name'] = container.find('h3', {'class': 'kKGYj TpwNx'}).text.strip()

            # Current price: either a range ("$19.97 – $24.97") or a single price
            current_price_elem = container.find('span', {'class': 'qHz0a BkySr EhCiu dls-111m7yq'})
            if current_price_elem:
                price_range_text = current_price_elem.text
                min_price, max_price = extract_price_range(price_range_text)
                if min_price is not None and max_price is not None:
                    product_data['min_price'] = min_price
                    product_data['max_price'] = max_price
                else:
                    # If the price range format doesn't match, handle it as a single price
                    product_data['min_price'] = product_data['max_price'] = float(
                        price_range_text.replace('$', '').replace(',', ''))
            else:
                product_data['min_price'] = None
                product_data['max_price'] = None

            # Discount, shown in one of several formats, e.g.
            # "(Up to 40% off select items)", "(40% off)", or "(Up to 40% off)"
            current_discount_elem = container.find('span', {'class': 'BkySr EhCiu dls-111m7yq'})
            if current_discount_elem:
                current_discount_text = current_discount_elem.text
                patterns = [
                    r'\(Up to (\d+% off) select items\)',
                    r'\((\d+% off)\)',
                    r'\(Up to (\d+% off)\)',
                ]
                percent = None
                for pattern in patterns:
                    match = re.search(pattern, current_discount_text)
                    if match:
                        percent = match.group(1)
                        break
                product_data['discount'] = percent
            else:
                product_data['discount'] = None

            # Product link and image
            product_link = container.find('a', {'class': 'AFBJb'})['href']
            product_data['product_link'] = f"https://www.nordstromrack.com{product_link}"
            product_data['image_url'] = container.find('img', {'name': 'product-module-image'})['src']

            product_data_list.append(product_data)
    except Exception as e:
        print("Error:", e)
    return product_data_list


# Main function for scraping all products
def scrape_all_products(base_url, total_pages, unwanted_brands):
    all_products = []
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}&page={page}"  # Assumes base_url already contains a query string
        print(f"Scraping page {page} - {page_url}")
        products_on_page = scrape_page(page_url, unwanted_brands)
        all_products.extend(products_on_page)
        # Add a delay between requests (avoid overwhelming the server)
        time.sleep(1)

    return all_products


async def main():
    async with Actor:
        # Structure of input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')
        number = actor_input.get('num')

        unwanted_brands = parse_unwanted_brands(actor_input.get('unwanted') or '')

        all_products = scrape_all_products(url, number, unwanted_brands)

        await Actor.push_data(all_products)
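For a quick local smoke test outside the Apify platform, a sketch along these lines works (run from the repository root with the dependencies installed; the listing URL and brand are hypothetical, and the scrape returns nothing if Nordstrom Rack has since changed the class names above):

from src.main import extract_price_range, scrape_all_products

# Unit-style check of the price-range parser
assert extract_price_range('$19.97 – $24.97') == (19.97, 24.97)

# Fetch a single listing page, excluding one brand
products = scrape_all_products(
    'https://www.nordstromrack.com/shop/Women/Clothing?origin=topnav',  # hypothetical URL
    total_pages=1,
    unwanted_brands=['Nike'],
)
print(f"Scraped {len(products)} products")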
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0