from apify import Actor

import requests
from bs4 import BeautifulSoup
import time
import re


def convert_to_array_or_string(input_str):
    """Split a comma-separated string into a list of trimmed values.

    A value without commas is returned unchanged as a plain string,
    e.g. "Nike, Adidas" -> ["Nike", "Adidas"], "Nike" -> "Nike".
    An empty or missing value yields an empty list.
    """
    if not input_str:
        return []
    if ',' in input_str:
        return [item.strip() for item in input_str.split(',')]
    return input_str


def extract_price_range(price_range_text):
    """Parse a price range such as "$24.97 – $39.97" into (min, max) floats.

    Returns (None, None) when the text is not a range (e.g. a single price).
    """
    # The site renders ranges with an en dash; allow thousands separators.
    pattern = r'\$([\d,]+\.?\d*) – \$([\d,]+\.?\d*)'
    price_range_match = re.search(pattern, price_range_text)

    if price_range_match:
        min_price = float(price_range_match.group(1).replace(',', ''))
        max_price = float(price_range_match.group(2).replace(',', ''))
        return min_price, max_price
    return None, None


def scrape_page(url, unwanted_brands):
    """Scrape a single listing page and return a list of product dicts."""
    try:
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            print(f"Request failed with status {response.status_code}: {url}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        product_data_list = []

        # Each product card is an <article> element with these CSS classes.
        product_containers = soup.find_all('article', {'class': 'ivm_G _PT1R'})

        for container in product_containers:
            product_data = {}

            brand_elem = container.find('div', {'class': 'KtWqU FKmne Io521'})
            current_brand_name = brand_elem.text.strip() if brand_elem else ''

            # Skip products from brands the user asked to exclude.
            if unwanted_brands and current_brand_name in unwanted_brands:
                continue
            product_data['brand_name'] = current_brand_name

            product_data['product_name'] = container.find('h3', {'class': 'kKGYj TpwNx'}).text.strip()

            # Prices are shown either as a range ("$X – $Y") or as a single value.
            current_price_elem = container.find('span', {'class': 'qHz0a BkySr EhCiu dls-111m7yq'})
            if current_price_elem:
                price_range_text = current_price_elem.text
                min_price, max_price = extract_price_range(price_range_text)
                if min_price is not None and max_price is not None:
                    product_data['min_price'] = min_price
                    product_data['max_price'] = max_price
                else:
                    # Single price: strip the currency symbol and thousands separators.
                    product_data['min_price'] = product_data['max_price'] = float(
                        price_range_text.replace('$', '').replace(',', '')
                    )
            else:
                product_data['min_price'] = None
                product_data['max_price'] = None

            # Discount text appears in one of three formats, e.g.
            # "(Up to 40% off select items)", "(40% off)" or "(Up to 40% off)".
            current_discount_elem = container.find('span', {'class': 'BkySr EhCiu dls-111m7yq'})
            if current_discount_elem:
                current_discount_text = current_discount_elem.text
                pattern1 = r'\(Up to (\d+% off) select items\)'
                pattern2 = r'\((\d+% off)\)'
                pattern3 = r'\(Up to (\d+% off)\)'

                match1 = re.search(pattern1, current_discount_text)
                match2 = re.search(pattern2, current_discount_text)
                match3 = re.search(pattern3, current_discount_text)

                percent = None
                if match1:
                    percent = match1.group(1)
                elif match2:
                    percent = match2.group(1)
                elif match3:
                    percent = match3.group(1)

                product_data['discount'] = percent
            else:
                product_data['discount'] = None

            # Relative product link and primary image for the card.
            product_link = container.find('a', {'class': 'AFBJb'})['href']
            product_data['product_link'] = f"https://www.nordstromrack.com{product_link}"

            product_data['image_url'] = container.find('img', {'name': 'product-module-image'})['src']

            product_data_list.append(product_data)

        return product_data_list
    except Exception as e:
        print("Error:", e)
        return []


def scrape_all_products(base_url, total_pages, unwanted_brands):
    """Scrape the given number of listing pages and collect all products."""
    all_products = []
    for page in range(1, total_pages + 1):
        # The base URL is expected to already contain a query string,
        # so the page number is appended with '&'.
        page_url = f"{base_url}&page={page}"
        print(f"Scraping page {page} - {page_url}")
        products_on_page = scrape_page(page_url, unwanted_brands)
        all_products.extend(products_on_page)

        # Small delay between requests to avoid hammering the site.
        time.sleep(1)

    return all_products


async def main():
    async with Actor:
        # Read the Actor input: the listing URL, how many pages to scrape,
        # and a comma-separated list of brands to exclude.
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url')
        # Default to a single page when no page count is provided.
        number = int(actor_input.get('num') or 1)

        unwanted_brands = convert_to_array_or_string(actor_input.get('unwanted'))

        all_products = scrape_all_products(url, number, unwanted_brands)

        # Store the scraped products in the Actor's default dataset.
        await Actor.push_data(all_products)
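

# Minimal sketch of a local entry point, assuming this module is run directly.
# On the Apify platform the generated __main__.py normally runs main() for you,
# in which case this guard is simply never triggered.
if __name__ == '__main__':
    import asyncio

    asyncio.run(main())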