Nordstrom Rack Scrape
Deprecated
Scrape Nordstrom Rack products and find the best-value one.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 3
Monthly users: 3
Last modified: 2 years ago
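The description above mentions finding the best-value product, while the source files below only collect brand, price, discount, link, and image fields for each item. A minimal post-processing sketch of that idea, assuming the dataset items produced by src/main.py (the helper name, sort key, and tie-breaking rule are illustrative choices, not part of the actor):

def pick_best_value(items):
    # Keep only items with a known price; treat a missing discount as "0% off".
    priced = [item for item in items if item.get('min_price') is not None]

    def discount_value(item):
        discount = item.get('discount') or '0% off'
        return int(discount.split('%')[0])

    # "Best value" here means the largest discount, with the lower price breaking ties.
    return max(priced, key=lambda item: (discount_value(item), -item['min_price']), default=None)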
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Scrape single page in Python", "description": "Scrape data from single page with provided URL.", "version": "0.0", "meta": { "templateId": "python-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of website you want to get the data from.", "editor": "textfield", "prefill": "https://www.apify.com/" }, "num": { "title": "how many pages? ", "type": "integer", "description": "how many pages you want", "editor": "number", "prefill": 1 }, "unwanted": { "title": "list of brands you don't want to see ", "type": "string", "description": "list of brands you don't want", "editor": "textfield", "prefill": " " } }, "required": ["url", "num","unwanted"]}
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Set up logging of messages from the Apify SDK
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
src/main.py
# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
from apify import Actor
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
import requests
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
from bs4 import BeautifulSoup
import time
import re
def convert_to_array_or_string(input_str):
    if ',' in input_str:
        # If the input string contains a comma, split it into an array of brand names
        array_of_strings = [item.strip() for item in input_str.split(',')]
        return array_of_strings
    else:
        # If there's no comma, still wrap the single brand in a list,
        # so the membership check in scrape_page compares whole brand names
        return [input_str.strip()]
def extract_price_range(price_range_text):
    # Match a price range such as "$24.97 – $49.97" and return (min, max) as floats
    pattern = r'\$(\d+\.?\d*) – \$(\d+\.?\d*)'
    price_range_match = re.search(pattern, price_range_text)

    if price_range_match:
        min_price = float(price_range_match.group(1))
        max_price = float(price_range_match.group(2))
        return min_price, max_price
    else:
        return None, None
def scrape_page(url, unwanted_brands):
    try:
        # Send a GET request to the URL
        response = requests.get(url)

        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the product containers (use appropriate HTML tags and attributes)
            product_data_list = []

            product_containers = soup.find_all('article', {'class': 'ivm_G _PT1R'})

            for container in product_containers:
                product_data = {}

                # Brand name
                current_brand_name = container.find('div', {'class': 'KtWqU FKmne Io521'}).text

                # Skip brands the user excluded
                if current_brand_name in unwanted_brands:
                    continue
                product_data['brand_name'] = current_brand_name

                # Product name
                product_data['product_name'] = container.find('h3', {'class': 'kKGYj TpwNx'}).text.strip()

                # Current price: either a single price or a "$min – $max" range
                current_price_elem = container.find('span', {'class': 'qHz0a BkySr EhCiu dls-111m7yq'})
                if current_price_elem:
                    price_range_text = current_price_elem.text
                    min_price, max_price = extract_price_range(price_range_text)
                    if min_price is not None and max_price is not None:
                        product_data['min_price'] = min_price
                        product_data['max_price'] = max_price
                    else:
                        # If the price range format doesn't match, handle it as a single price
                        product_data['min_price'] = product_data['max_price'] = float(price_range_text.replace('$', '').replace(',', ''))
                else:
                    product_data['min_price'] = None
                    product_data['max_price'] = None

                # Discount, e.g. "(40% off)", "(Up to 60% off)" or "(Up to 60% off select items)"
                current_discount_elem = container.find('span', {'class': 'BkySr EhCiu dls-111m7yq'})

                if current_discount_elem:
                    current_discount_text = current_discount_elem.text
                    pattern1 = r'\(Up to (\d+% off) select items\)'
                    pattern2 = r'\((\d+% off)\)'
                    pattern3 = r'\(Up to (\d+% off)\)'

                    match1 = re.search(pattern1, current_discount_text)
                    match2 = re.search(pattern2, current_discount_text)
                    match3 = re.search(pattern3, current_discount_text)

                    percent = None
                    if match1:
                        percent = match1.group(1)
                    elif match2:
                        percent = match2.group(1)
                    elif match3:
                        percent = match3.group(1)

                    product_data['discount'] = percent
                else:
                    product_data['discount'] = None

                # Product link and image
                product_link = container.find('a', {'class': 'AFBJb'})['href']
                product_data['product_link'] = f"https://www.nordstromrack.com{product_link}"

                product_data['image_url'] = container.find('img', {'name': 'product-module-image'})['src']

                product_data_list.append(product_data)

            return product_data_list

        # Return an empty list on non-200 responses so callers can still extend their results
        print(f"Request failed with status code {response.status_code}: {url}")
        return []
    except Exception as e:
        print("Error:", e)
        return []
# Main function for scraping all products
def scrape_all_products(base_url, total_pages, unwanted_brands):
    all_products = []
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}&page={page}"  # Adjust the URL pattern accordingly
        print(f"Scraping page {page} - {page_url}")
        products_on_page = scrape_page(page_url, unwanted_brands)
        all_products.extend(products_on_page)
        # Add a delay between requests (avoid overwhelming the server)
        time.sleep(1)

    return all_products
async def main():
    async with Actor:
        # Structure of input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')
        number = actor_input.get('num')
        unwanted_brands = convert_to_array_or_string(actor_input.get('unwanted') or '')

        all_p = scrape_all_products(url, number, unwanted_brands)

        await Actor.push_data(all_p)
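For local testing outside the Apify platform, the scraping functions can also be called directly. A small sketch, assuming the module layout above and a hypothetical Nordstrom Rack category URL that already carries a query string (the &page= pattern in scrape_all_products expects one):

from src.main import convert_to_array_or_string, scrape_all_products

if __name__ == '__main__':
    base_url = 'https://www.nordstromrack.com/category/women/shoes?origin=topnav'  # hypothetical example
    unwanted = convert_to_array_or_string('BrandA, BrandB')
    products = scrape_all_products(base_url, 2, unwanted)
    print(f"Scraped {len(products)} products")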
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
beautifulsoup4 ~= 4.12.0
requests ~= 2.31.0