1"""
2Browser configuration and management for SHEIN scraper
3Supports Smartproxy (Israel) and Apify Proxy for reliable scraping with full captcha bypass.
4"""
5
6import undetected_chromedriver as uc
7from selenium_stealth import stealth
8from selenium.webdriver.common.by import By
9import time
10import json
11import os
12import random
13import tempfile
14import logging
15
16
# Third-party loggers that are too chatty for scraper output: only ERROR and
# above are let through.
for _noisy_logger in ('seleniumwire', 'hpack', 'urllib3'):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)
20
21
# selenium-wire is an optional dependency; record whether it can be imported
# so callers can check SELENIUMWIRE_AVAILABLE instead of importing it again.
try:
    from seleniumwire import webdriver as wire_webdriver
except ImportError:
    SELENIUMWIRE_AVAILABLE = False
else:
    SELENIUMWIRE_AVAILABLE = True
27
28
29
# Browser identity presented to the site: the user agent is passed to Chrome
# as a command-line argument, the remaining fields are handed to
# selenium-stealth.
FINGERPRINT = dict(
    user_agent=(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    platform="Win32",
    vendor="Google Inc.",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
)
37
38
39
40
41
# Smartproxy gateway endpoint. Each value can be overridden through an
# environment variable of the same name; the literal is the fallback.
SMARTPROXY_SERVER = os.environ.get("SMARTPROXY_SERVER") or "proxy.smartproxy.net"
SMARTPROXY_PORT = os.environ.get("SMARTPROXY_PORT") or "3120"

# SECURITY: credentials should be supplied via SMARTPROXY_USERNAME and
# SMARTPROXY_PASSWORD rather than committed to source control. The literals
# below are kept only so existing deployments keep working.
# TODO: rotate these credentials and remove the fallbacks.
SMARTPROXY_USERNAME_BASE = os.environ.get("SMARTPROXY_USERNAME") or "smart-v1zmwetcduzk"
SMARTPROXY_PASSWORD = os.environ.get("SMARTPROXY_PASSWORD") or "PcnWBokjVDGawLA9"

# Module-level proxy state, updated by get_proxies_from_api(),
# get_random_proxy() and get_next_proxy().
_proxy_initialized = False
_current_session_id = None
51
52
def _generate_session_id():
    """Return a fresh 8-character hexadecimal token taken from a random UUID."""
    from uuid import uuid4

    # The first eight hex digits of a UUID4 are the same characters that
    # precede the first hyphen in its string form.
    return uuid4().hex[:8]
57
58
def _build_proxy_url(session_id=None):
    """Compose the Smartproxy URL ``http://user:password@server:port``.

    When ``session_id`` is truthy it is appended to the base username as
    ``_session-<id>``; otherwise the base username is used unchanged.
    """
    username = SMARTPROXY_USERNAME_BASE
    if session_id:
        username = f"{username}_session-{session_id}"
    credentials = f"{username}:{SMARTPROXY_PASSWORD}"
    endpoint = f"{SMARTPROXY_SERVER}:{SMARTPROXY_PORT}"
    return f"http://{credentials}@{endpoint}"
67
68
def is_israel_domain(url):
    """Return True when ``url`` contains ``il.shein.com`` (case-insensitive).

    A falsy ``url`` (``None`` or an empty string) yields False.
    """
    if not url:
        return False
    return 'il.shein.com' in url.lower()
72
73
def is_supported_domain(url):
    """Return True when ``url`` contains one of the supported SHEIN hosts.

    Matching is a case-insensitive substring test against ``il.shein.com``
    and ``ar.shein.com``. A falsy ``url`` yields False.
    """
    if url:
        lowered = url.lower()
        for host in ('il.shein.com', 'ar.shein.com'):
            if host in lowered:
                return True
    return False
81
82
def should_use_smartproxy(urls):
    """Tell whether any of ``urls`` targets a supported SHEIN domain.

    Returns False for a falsy ``urls`` (``None`` or an empty collection);
    otherwise True as soon as one URL passes ``is_supported_domain``.
    """
    return bool(urls) and any(is_supported_domain(candidate) for candidate in urls)


# Second public name bound to the same function.
should_use_proxies_com = should_use_smartproxy
98
99
def get_proxies_from_api(proxy_name=None):
    """Mark the proxy layer as initialised and return a one-element proxy list.

    No API request is made here: a new session ID is generated, stored in the
    module state, and the matching Smartproxy URL is returned inside a list.
    ``proxy_name`` is accepted but not used.
    """
    global _proxy_initialized, _current_session_id
    _proxy_initialized = True
    session = _generate_session_id()
    _current_session_id = session
    return [_build_proxy_url(session)]
106
107
def get_random_proxy(exclude_proxy=None):
    """Return a Smartproxy URL bound to a newly generated session ID.

    The new session ID is also recorded in the module state.
    ``exclude_proxy`` is accepted but not used.
    """
    global _current_session_id
    fresh_session = _generate_session_id()
    _current_session_id = fresh_session
    return _build_proxy_url(fresh_session)
113
114
def get_next_proxy(current_proxy=None):
    """Rotate to a new session ID and return the corresponding Smartproxy URL.

    The new session ID replaces the one held in the module state.
    ``current_proxy`` is accepted but not used.
    """
    global _current_session_id
    rotated_session = _generate_session_id()
    _current_session_id = rotated_session
    return _build_proxy_url(rotated_session)
120
121
def create_proxy_auth_extension(proxy_host, proxy_user, proxy_pass):
    """
    Create an unpacked Chrome extension that configures a fixed HTTP proxy and
    answers its authentication challenge, so Chrome never shows the auth popup.

    Args:
        proxy_host: ``host:port`` of the proxy. A leading ``scheme://`` is
            tolerated and ignored; the port defaults to ``80`` when absent.
        proxy_user: Proxy username.
        proxy_pass: Proxy password.

    Returns:
        Path to a newly created temporary directory containing
        ``manifest.json`` and ``background.js``. The directory is not removed
        by this function; the caller owns its cleanup.
    """
    # Drop an optional scheme so "http://host:port" does not leak "http://"
    # into the host field.
    endpoint = proxy_host.split('://', 1)[-1]
    if ':' in endpoint:
        host, port = endpoint.rsplit(':', 1)
    else:
        host, port = endpoint, "80"

    # NOTE(review): this is a Manifest V2 extension; confirm the targeted
    # Chrome version still loads MV2 extensions.
    manifest = {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Proxy Auth Extension",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking",
        ],
        "background": {
            "scripts": ["background.js"],
        },
    }

    # Every substituted value goes through json.dumps so it becomes a
    # correctly escaped JavaScript string literal: quotes or backslashes in
    # the credentials can no longer break (or inject into) the script.
    background_js = """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: "http",
            host: %s,
            port: parseInt(%s, 10)
        },
        bypassList: ["localhost"]
    }
};

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: %s,
            password: %s
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
""" % (
        json.dumps(host),
        json.dumps(str(port)),
        json.dumps(proxy_user),
        json.dumps(proxy_pass),
    )

    ext_dir = tempfile.mkdtemp(prefix="proxy_auth_")

    with open(os.path.join(ext_dir, "manifest.json"), 'w', encoding="utf-8") as f:
        f.write(json.dumps(manifest, indent=4))

    with open(os.path.join(ext_dir, "background.js"), 'w', encoding="utf-8") as f:
        f.write(background_js)

    return ext_dir
201
202
# Script registered with Page.addScriptToEvaluateOnNewDocument. It runs before
# the page's own scripts, when <head> normally does not exist yet, so the
# <style> element is attached as soon as a parent node becomes available
# instead of dereferencing document.head directly.
_POPUP_SUPPRESSOR_JS = """
(function () {
    // Inject CSS to hide popups as early as possible
    const style = document.createElement('style');
    style.textContent = `
        .coupon-dialog,
        .coupon-dialog__coupon-content,
        .sui-dialog,
        .popup-overlay,
        .modal-overlay,
        .dialog-overlay,
        [class*="coupon-dialog"],
        [class*="promotion-dialog"],
        [role="dialog"] {
            display: none !important;
            visibility: hidden !important;
            opacity: 0 !important;
        }
    `;

    function attach() {
        if (style.isConnected) { return true; }
        const parent = document.head || document.documentElement;
        if (!parent) { return false; }
        parent.appendChild(style);
        return true;
    }

    if (!attach()) {
        new MutationObserver(function (mutations, observer) {
            if (attach()) { observer.disconnect(); }
        }).observe(document, {childList: true, subtree: true});
    }

    // Also prevent body scroll lock
    document.addEventListener('DOMContentLoaded', function () {
        attach();
        if (document.body) {
            // '!important' must be passed as the priority argument; a value
            // such as 'auto !important' assigned through the setter is ignored.
            document.body.style.setProperty('overflow', 'auto', 'important');
            document.body.style.setProperty('position', 'static', 'important');
        }
    });
})();
"""


def _split_proxy(proxy):
    """Split a proxy string into ``(host, user, password)``.

    The scheme, when present, is discarded. Credentials are returned only when
    the part before the last ``@`` has the form ``user:password``; otherwise
    ``user`` and ``password`` are None.
    """
    if not proxy:
        return None, None, None
    remainder = proxy.split('://', 1)[1] if '://' in proxy else proxy
    user = password = None
    if '@' in remainder:
        # Split on the LAST '@' so an '@' inside the password is preserved.
        credentials, remainder = remainder.rsplit('@', 1)
        if ':' in credentials:
            user, password = credentials.split(':', 1)
    return remainder, user, password


def _create_authenticated_driver(proxy_host, proxy_user, proxy_pass, headless):
    """Start Chrome through selenium-wire, which supplies the proxy credentials."""
    if not SELENIUMWIRE_AVAILABLE:
        raise RuntimeError("selenium-wire required for authenticated proxy. pip install selenium-wire")
    import platform as _platform
    from seleniumwire import webdriver as sw
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.page_load_strategy = 'eager'
    arguments = [
        "--window-size=1920,1080",
        f"user-agent={FINGERPRINT['user_agent']}",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-infobars",
        "--ignore-certificate-errors",
        "--ignore-ssl-errors",
        "--blink-settings=imagesEnabled=false",
        "--disable-remote-fonts",
        "--disable-plugins",
        "--disable-extensions",
        "--disable-default-apps",
        "--disable-background-networking",
        "--disable-background-timer-throttling",
        "--disable-backgrounding-occluded-windows",
        "--disable-breakpad",
        "--disable-component-extensions-with-background-pages",
        # One combined switch: when --disable-features is passed twice Chrome
        # keeps only the last occurrence, which silently dropped TranslateUI.
        "--disable-features=TranslateUI,VizDisplayCompositor",
        "--disable-hang-monitor",
        "--disable-ipc-flooding-protection",
        "--disable-renderer-backgrounding",
        "--disable-sync",
        "--metrics-recording-only",
        "--no-first-run",
        "--safebrowsing-disable-auto-update",
        "--disable-software-rasterizer",
        "--disable-setuid-sandbox",
    ]
    if _platform.system() == "Linux":
        arguments += ["--single-process", "--memory-pressure-off"]
    if headless:
        arguments.append("--headless=new")
    for argument in arguments:
        options.add_argument(argument)

    proxy_url = f"http://{proxy_user}:{proxy_pass}@{proxy_host}"
    sw_options = {
        'proxy': {
            'http': proxy_url,
            'https': proxy_url,
            'no_proxy': 'localhost,127.0.0.1'
        },
        'disable_capture': True,
        'verify_ssl': False,
        'suppress_connection_errors': True,
        'connection_timeout': 30,
        'request_storage_max_size': 0,
        'disable_encoding': True,
    }
    return sw.Chrome(options=options, seleniumwire_options=sw_options)


def _create_plain_driver(proxy_host, headless, chrome_version_main):
    """Start undetected-chromedriver, optionally via an unauthenticated proxy."""
    import platform as _platform

    options = uc.ChromeOptions()
    options.page_load_strategy = 'eager'
    arguments = [
        "--window-size=1920,1080",
        f"user-agent={FINGERPRINT['user_agent']}",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-infobars",
        "--disable-notifications",
        "--disable-popup-blocking",
        "--blink-settings=imagesEnabled=false",
        "--disable-blink-features=AutomationControlled",
        "--disable-background-networking",
        "--disable-background-timer-throttling",
        "--disable-breakpad",
        "--disable-features=TranslateUI",
        "--disable-hang-monitor",
        "--disable-sync",
        "--no-first-run",
        "--disable-software-rasterizer",
        "--disable-setuid-sandbox",
    ]
    if _platform.system() == "Linux":
        arguments += ["--single-process", "--memory-pressure-off"]
    if headless:
        arguments += ["--headless=new", "--disable-gpu"]
    if proxy_host:
        arguments.append(f"--proxy-server=http://{proxy_host}")
    for argument in arguments:
        options.add_argument(argument)

    return uc.Chrome(options=options, version_main=chrome_version_main)


def _apply_best_effort_tweaks(driver):
    """Apply stealth patches, resource blocking and popup hiding.

    Each step is optional: a failure is logged and the remaining steps still
    run, so a driver that lacks CDP support is still usable.
    """
    log = logging.getLogger(__name__)

    try:
        stealth(driver,
                languages=["en-US", "en"],
                vendor=FINGERPRINT["vendor"],
                platform=FINGERPRINT["platform"],
                webgl_vendor=FINGERPRINT["webgl_vendor"],
                renderer=FINGERPRINT["renderer"],
                fix_hairline=True,
                webdriver=False
                )
    except Exception as exc:
        log.debug("selenium-stealth could not be applied: %s", exc)

    try:
        # Enable the Network domain before configuring it.
        driver.execute_cdp_cmd('Network.enable', {})
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
            'urls': [
                '*.woff', '*.woff2', '*.ttf', '*.otf',
                '*.mp4', '*.webm', '*.avi',
                '*google-analytics*', '*googletagmanager*',
                '*facebook*', '*fbcdn*',
                '*doubleclick*', '*adsense*',
                '*.gif',
            ]
        })
    except Exception as exc:
        log.debug("Could not install the blocked-URL list: %s", exc)

    try:
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': _POPUP_SUPPRESSOR_JS
        })
    except Exception as exc:
        log.debug("Could not register the popup-hiding script: %s", exc)


def get_browser(proxy=None, headless=True, chrome_version_main=143):
    """
    Initialize and return a configured Chrome browser instance

    Args:
        proxy: Proxy string - can be:
            - Authenticated: "http://user:pass@host:port"
              (e.g. "http://user:pass@proxy.apify.com:8000")
            - Simple format: "ip:port" or "http://ip:port"
            The scheme is ignored; the proxy is always addressed over http.
            When both a user and a password are present the browser is started
            through selenium-wire; otherwise undetected-chromedriver is used.
        headless: Run in headless mode (default True)
        chrome_version_main: Chrome major version handed to
            undetected-chromedriver as ``version_main`` (only used on the
            undetected-chromedriver path). Defaults to the previously
            hard-coded 143.

    Returns:
        WebDriver instance with a 10 s implicit wait, a 60 s page-load timeout
        and a 30 s script timeout.

    Raises:
        RuntimeError: an authenticated proxy was requested but selenium-wire
            is not installed.
        Exception: whatever the driver constructor or the final timeout
            configuration raises; a driver that was already started is quit
            before the exception propagates.
    """
    proxy_host, proxy_user, proxy_pass = _split_proxy(proxy)

    if proxy_user and proxy_pass and proxy_host:
        driver = _create_authenticated_driver(proxy_host, proxy_user, proxy_pass, headless)
    else:
        driver = _create_plain_driver(proxy_host, headless, chrome_version_main)

    try:
        _apply_best_effort_tweaks(driver)

        driver.implicitly_wait(10)
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(30)
    except BaseException:
        # Do not leave an orphaned Chrome process behind when setup fails.
        try:
            driver.quit()
        except Exception:
            pass
        raise

    return driver
437
438
def close_browser(driver):
    """Quit ``driver``, ignoring errors raised by the shutdown itself.

    Args:
        driver: WebDriver instance to quit; ``None`` is accepted and ignored.

    Returns:
        None. Ordinary exceptions from ``driver.quit()`` are logged at debug
        level and suppressed; ``KeyboardInterrupt`` and ``SystemExit`` are
        allowed to propagate.
    """
    if driver is None:
        return
    try:
        driver.quit()
    except Exception as exc:
        logging.getLogger(__name__).debug("Ignoring error while closing browser: %s", exc)
445
446
def close_popup(driver):
    """Hide popup/overlay elements on the current page with a single script.

    The script hides every element matching the popup selectors, clicks any
    close buttons it finds, and resets the body's ``overflow``/``position``
    so the page can scroll again.

    Args:
        driver: WebDriver instance whose current page should be cleaned up.

    Returns:
        True when the script ran without raising, False when executing it
        raised an ordinary exception. ``KeyboardInterrupt`` and ``SystemExit``
        are not suppressed.
    """
    try:
        driver.execute_script("""
            const selectors = '.coupon-dialog, .sui-dialog, [role="dialog"], .popup-overlay, .modal-overlay, .she-mask, [class*="coupon"], [class*="modal"], [class*="popup"]';
            document.querySelectorAll(selectors).forEach(el => {
                el.style.display = 'none';
                el.style.visibility = 'hidden';
            });
            document.querySelectorAll('.sui-dialog__close, .sui-icon-close, .icon-close, [aria-label="close"], [aria-label="Close"]').forEach(btn => {
                try { btn.click(); } catch(e) {}
            });
            if (document.body) {
                document.body.style.overflow = 'auto';
                document.body.style.position = 'static';
            }
        """)
        return True
    except Exception as exc:
        logging.getLogger(__name__).debug("close_popup script failed: %s", exc)
        return False
465
466
def _usable_cookies(payload):
    """Return the named cookie dicts found in ``payload`` ([] if none)."""
    if not isinstance(payload, list):
        return []
    return [entry for entry in payload if isinstance(entry, dict) and entry.get('name')]


def load_cookies(driver, cookies_json=None, cookie_file="cookies.json",
                 base_url="https://il.shein.com"):
    """Load cookies from a JSON string, a list, or a file into the browser.

    Cookies are taken from ``cookies_json`` when it yields at least one usable
    entry; otherwise the first candidate file that yields usable entries is
    used. An entry is usable when it is a dict with a non-empty ``name``.

    Every cookie is installed through the ``Network.setCookie`` CDP command
    with its domain forced to ``.shein.com`` and its path to ``/``; only the
    ``name`` and ``value`` of each entry are used.

    Args:
        driver: WebDriver instance. It is not touched when no usable cookies
            are found.
        cookies_json: JSON text encoding a list of cookie dicts, or such a
            list directly.
        cookie_file: File name tried as given and under ``/usr/src/app/``.
        base_url: Page opened before the cookies are set. Defaults to the
            previously hard-coded Israel storefront.

    Returns:
        True when at least one cookie was set (the page is then refreshed),
        False otherwise.
    """
    log = logging.getLogger(__name__)
    cookies = []

    if cookies_json:
        payload = cookies_json
        if isinstance(cookies_json, str):
            try:
                payload = json.loads(cookies_json)
            except ValueError as exc:  # json.JSONDecodeError is a ValueError
                log.debug("cookies_json is not valid JSON: %s", exc)
                payload = None
        cookies = _usable_cookies(payload)

    if not cookies:
        # dict.fromkeys drops duplicate candidates while keeping their order.
        candidates = dict.fromkeys(
            (cookie_file, f"/usr/src/app/{cookie_file}", f"./{cookie_file}")
        )
        for path in candidates:
            if not os.path.exists(path):
                continue
            try:
                with open(path, 'r', encoding="utf-8") as f:
                    payload = json.load(f)
            except (OSError, ValueError) as exc:
                log.debug("Could not read cookies from %s: %s", path, exc)
                continue
            cookies = _usable_cookies(payload)
            if cookies:
                break

    if not cookies:
        return False

    driver.get(base_url)
    time.sleep(2)

    success_count = 0
    for cookie in cookies:
        try:
            driver.execute_cdp_cmd('Network.setCookie', {
                'name': cookie.get('name'),
                'value': cookie.get('value'),
                'domain': '.shein.com',
                'path': '/'
            })
        except Exception as exc:
            # Best effort: one rejected cookie must not abort the rest.
            log.debug("Could not set cookie %r: %s", cookie.get('name'), exc)
            continue
        success_count += 1

    if success_count > 0:
        # Reload so the page is requested again with the new cookies.
        driver.refresh()
        time.sleep(1)

    return success_count > 0