import { Actor } from 'apify';
import { PlaywrightCrawler, createPlaywrightRouter } from 'crawlee';
import type { PlaywrightCrawlingContext } from 'crawlee';
import { chromium } from 'rebrowser-playwright';
import { InputSchema, ValidatedInput, EtsyProduct } from './types.js';
import { HumanBehavior } from './human-behavior.js';
import { DataDomeSolver } from './datadome-solver.js';
7
8
9
10
11
12
13
14
15
16
17
/**
 * Etsy scraper built on Crawlee's `PlaywrightCrawler`.
 *
 * Start URLs are derived from the validated input and routed by label:
 * - `SEARCH`   – opens the homepage, types the query into the search box and
 *                scrapes the result cards.
 * - `PRODUCT`  – scrapes a single listing page.
 * - `CATEGORY` / `SHOP` – scrapes whatever listing cards the page renders.
 *
 * Every handler checks for a challenge page through `DataDomeSolver` and
 * aborts the request (by throwing) when the challenge cannot be solved.
 */
class EtsyScraper {
    /** Selectors tried, in order, to locate the visible search box on the homepage. */
    private static readonly SEARCH_INPUT_SELECTORS: readonly string[] = [
        'input#global-enhancements-search-query',
        'input[name="search_query"]',
        'input[placeholder*="Search"]',
        'input[type="search"]',
        '#search-query',
        '.wt-input-btn-group input',
    ];

    /** Resource types aborted before they are fetched. */
    private static readonly BLOCKED_RESOURCE_TYPES: readonly string[] = ['image', 'media', 'font'];

    /** URL fragments of third-party trackers that are aborted as well. */
    private static readonly BLOCKED_URL_FRAGMENTS: readonly string[] = [
        'google-analytics',
        'facebook.net',
        'doubleclick',
    ];

    private input: ValidatedInput;
    /** Number of products pushed to the dataset so far. */
    private itemCount = 0;
    /** Currently never read or written by any method of this class. */
    private sessionWarmedUp = false;
    private dataDomeSolver: DataDomeSolver;

    /**
     * @param input Validated actor input that drives start URLs, filters and limits.
     */
    constructor(input: ValidatedInput) {
        this.input = input;

        // An empty key is passed through when the environment variable is not set.
        const apiKey = process.env.CAPSOLVER_API_KEY ?? '';
        this.dataDomeSolver = new DataDomeSolver(apiKey);
    }

    /**
     * Builds the router and crawler, runs the crawl over the generated start
     * URLs and logs the number of products saved.
     */
    async run(): Promise<void> {
        const proxyConfiguration = await Actor.createProxyConfiguration({
            groups: ['RESIDENTIAL'],
            countryCode: 'US',
        });

        console.log(`🔗 Using US residential proxies\n`);

        const router = createPlaywrightRouter();
        router.addHandler('SEARCH', (ctx) => this.handleSearch(ctx));
        router.addHandler('PRODUCT', (ctx) => this.handleProduct(ctx));
        // These two labels are produced by labelForUrl(); without a route the
        // router rejects the request because no default handler is registered.
        router.addHandler('CATEGORY', (ctx) => this.handleListingPage(ctx));
        router.addHandler('SHOP', (ctx) => this.handleListingPage(ctx));

        const crawler = new PlaywrightCrawler({
            proxyConfiguration,
            requestHandlerTimeoutSecs: 120,
            useSessionPool: true,
            persistCookiesPerSession: true,
            requestHandler: router,

            // One page at a time, throttled to 12 requests per minute.
            minConcurrency: 1,
            maxConcurrency: 1,
            maxRequestsPerMinute: 12,

            browserPoolOptions: {
                useFingerprints: false,
            },

            sessionPoolOptions: {
                // No status code is treated as "blocked" by the session pool;
                // challenge pages are detected inside the handlers instead.
                blockedStatusCodes: [],
                maxPoolSize: 10,
            },

            launchContext: {
                launcher: chromium,
                launchOptions: {
                    headless: false,
                },
            },

            preNavigationHooks: [
                async ({ page }, gotoOptions) => {
                    await page.setViewportSize({ width: 1920, height: 1080 });

                    await page.route('**/*', (route) => {
                        const outgoing = route.request();

                        if (EtsyScraper.BLOCKED_RESOURCE_TYPES.includes(outgoing.resourceType())) {
                            return route.abort();
                        }

                        const outgoingUrl = outgoing.url();
                        if (EtsyScraper.BLOCKED_URL_FRAGMENTS.some((fragment) => outgoingUrl.includes(fragment))) {
                            return route.abort();
                        }

                        return route.continue();
                    });

                    gotoOptions.waitUntil = 'domcontentloaded';
                    gotoOptions.timeout = 60000;
                },
            ],
        });

        const startUrls = this.generateStartUrls();
        console.log(`🚀 Starting scraper with ${startUrls.length} URLs\n`);

        await crawler.run(startUrls.map((url) => ({ url, label: EtsyScraper.labelForUrl(url) })));

        console.log(`\n✅ Complete! Scraped ${this.itemCount} products`);
    }

    /**
     * Maps a start URL to the router label that should process it.
     * Anything that is not a listing, shop or category URL is treated as the
     * entry point of a search.
     */
    private static labelForUrl(url: string): string {
        if (url.includes('/listing/')) return 'PRODUCT';
        if (url.includes('/shop/')) return 'SHOP';
        if (url.includes('/c/')) return 'CATEGORY';
        return 'SEARCH';
    }

    /**
     * SEARCH handler: starting from the already-loaded homepage, types the
     * query into the search box, submits it and scrapes the result cards.
     *
     * @throws Error when no query is configured, the search box cannot be
     *         found, or a challenge page cannot be solved.
     */
    private async handleSearch(
        { page, proxyInfo }: Pick<PlaywrightCrawlingContext, 'page' | 'proxyInfo'>,
    ): Promise<void> {
        const searchQuery = this.resolveSearchQuery();
        if (!searchQuery) {
            console.log(' ❌ No search query specified');
            throw new Error('No search query');
        }

        console.log(`🔍 Will search for: "${searchQuery}"`);

        try {
            await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
        } catch {
            // Best effort: continue with whatever has loaded so far.
            console.log(' ⚠️ Page load timeout');
        }

        // Snapshot of the homepage is stored for debugging.
        const homepageHtml = await page.content();
        await Actor.setValue('homepage.html', homepageHtml, { contentType: 'text/html' });
        console.log(` 📄 Homepage loaded, HTML saved (${homepageHtml.length} chars)`);

        await this.solveChallengeIfBlocked(
            page,
            proxyInfo,
            'Homepage blocked',
            ' ⚠️ Challenge detected on homepage',
        );

        const humanBehavior = new HumanBehavior(page);
        await humanBehavior.initialize();

        console.log(' 🔥 Simulating human behavior on homepage...');
        await this.naturalDelay(2000, 3000);
        await humanBehavior.naturalScroll(2);
        await humanBehavior.randomMouseMovements(3);
        await this.naturalDelay(1000, 2000);

        const searchInput = await this.findSearchInput(page);

        await searchInput.click();
        await this.naturalDelay(300, 600);

        // Typed one character at a time so every keystroke gets its own random delay.
        console.log(` ⌨️ Typing: "${searchQuery}"`);
        for (const char of searchQuery) {
            await page.keyboard.type(char, { delay: 50 + Math.random() * 100 });
        }
        await this.naturalDelay(500, 1000);

        console.log(' ⏎ Pressing Enter...');
        await page.keyboard.press('Enter');

        try {
            await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
            await this.naturalDelay(2000, 3000);
        } catch {
            console.log(' ⚠️ Timeout waiting for search results');
        }

        await humanBehavior.naturalScroll(2);
        await humanBehavior.randomMouseMovements(2);
        await this.naturalDelay(1000, 2000);

        await this.solveChallengeIfBlocked(
            page,
            proxyInfo,
            'Search results blocked',
            ' ⚠️ Challenge detected on search results',
        );

        const pageTitle = await page.title();
        const listingCount = await page.$$eval('[data-palette-listing-id]', (els) => els.length);
        console.log(` 📄 Page: "${pageTitle}" | Found ${listingCount} listing elements`);

        // Snapshot of the results page is stored for debugging.
        const html = await page.content();
        await Actor.setValue('search-results.html', html, { contentType: 'text/html' });

        const products = await this.extractListingCards(page);
        console.log(`📋 Extracted ${products.length} products from search results`);

        await this.saveProducts(products);
    }

    /**
     * PRODUCT handler: scrapes a single listing page and pushes the result.
     * Input filters are not applied to explicitly requested product URLs.
     *
     * @throws Error when a challenge page cannot be solved.
     */
    private async handleProduct(
        { page, request, proxyInfo }: Pick<PlaywrightCrawlingContext, 'page' | 'request' | 'proxyInfo'>,
    ): Promise<void> {
        console.log(`📦 Scraping product: ${request.url}`);

        const humanBehavior = new HumanBehavior(page);
        await humanBehavior.initialize();

        try {
            await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
            await this.naturalDelay(1000, 2000);
        } catch {
            console.log(' ⚠️ Timeout waiting for page load');
        }

        await humanBehavior.naturalScroll(2);

        await this.solveChallengeIfBlocked(page, proxyInfo, 'Page blocked');

        // The callback is serialized and executed inside the page, so it must
        // not reference anything from this module and is kept free of named
        // inner helper functions.
        const product = await page.evaluate(() => {
            try {
                const title = document.querySelector('h1')?.textContent?.trim() || '';
                const productId = window.location.href.match(/\/listing\/(\d+)/)?.[1] || '';

                let price = 0;
                const priceEl = document.querySelector('[data-selector="price-only"]') ||
                    document.querySelector('.wt-text-title-03') ||
                    document.querySelector('[class*="price"]');
                // The match must start with a digit so a stray comma cannot produce NaN.
                const priceMatch = priceEl?.textContent?.match(/\d[\d,]*(?:\.\d+)?/);
                if (priceMatch) price = parseFloat(priceMatch[0].replace(/,/g, ''));

                let shopName = '';
                let shopUrl = '';

                // Preferred source: the shop link inside the product-details panel.
                const detailsShopLink = document.querySelector(
                    '#product_details_content_toggle > div > div:nth-child(1) > ul > div > li > div.wt-ml-xs-1.how-its-made-label-product-details > a',
                );
                if (detailsShopLink) {
                    shopName = detailsShopLink.textContent?.trim() || '';
                    shopUrl = detailsShopLink.getAttribute('href') || '';
                }

                // Fallback: the first link on the page that points at a shop.
                if (!shopName) {
                    const anyShopLink = document.querySelector('a[href*="/shop/"]');
                    if (anyShopLink) {
                        shopName = anyShopLink.textContent?.trim() || '';
                        shopUrl = anyShopLink.getAttribute('href') || '';
                    }
                }

                let rating = 0;
                let reviewCount = 0;
                const reviewsBadge = document.querySelector('#reviews-link') ||
                    document.querySelector('a[href="#reviews"]');
                if (reviewsBadge) {
                    // Thousands separators are part of the number ("1,234" is 1234, not 1).
                    const countMatch = (reviewsBadge.textContent || '').match(/\d[\d,]*/);
                    if (countMatch) reviewCount = parseInt(countMatch[0].replace(/,/g, ''), 10);

                    const stars = document.querySelector('input[name="rating"]');
                    if (stars) {
                        const parsedRating = parseFloat(stars.getAttribute('value') || '0');
                        if (Number.isFinite(parsedRating)) rating = parsedRating;
                    }
                }

                const imageUrl = document.querySelector('img.wt-image')?.getAttribute('src') || '';

                if (shopUrl && !shopUrl.startsWith('http')) {
                    shopUrl = `https://www.etsy.com${shopUrl}`;
                }

                return {
                    productId,
                    title,
                    url: window.location.href,
                    price,
                    rating,
                    reviewCount,
                    shopName: shopName || 'N/A',
                    shopUrl: shopUrl || '',
                    imageUrl,
                    scrapedAt: new Date().toISOString(),
                } as EtsyProduct;
            } catch {
                // Extraction is best effort; the caller reports the failure.
                return null;
            }
        });

        if (product && product.title) {
            console.log(` ✅ "${product.title.substring(0, 50)}..." | Shop: ${product.shopName}`);
            await this.pushProduct(product);
            this.itemCount++;
        } else {
            console.log(' ❌ Failed to extract product details');
        }
    }

    /**
     * CATEGORY / SHOP handler: scrapes the listing cards rendered on the page.
     *
     * NOTE(review): assumes category and shop pages use the same
     * `[data-palette-listing-id]` cards as search results — confirm against
     * live markup. When they do not, zero products are extracted and the
     * request still completes.
     *
     * @throws Error when a challenge page cannot be solved.
     */
    private async handleListingPage(
        { page, request, proxyInfo }: Pick<PlaywrightCrawlingContext, 'page' | 'request' | 'proxyInfo'>,
    ): Promise<void> {
        console.log(`📂 Scraping listing page: ${request.url}`);

        try {
            await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
        } catch {
            console.log(' ⚠️ Timeout waiting for page load');
        }

        await this.solveChallengeIfBlocked(page, proxyInfo, 'Page blocked');

        const humanBehavior = new HumanBehavior(page);
        await humanBehavior.initialize();
        await humanBehavior.naturalScroll(2);

        const products = await this.extractListingCards(page);
        console.log(`📋 Extracted ${products.length} products from ${request.url}`);

        await this.saveProducts(products);
    }

    /**
     * Returns the search query: the `q` parameter of `input.searchUrl` when it
     * has one, otherwise `input.query`, otherwise an empty string.
     */
    private resolveSearchQuery(): string {
        const fallback = this.input.query || '';
        const rawQuery = this.input.searchUrl?.match(/[?&]q=([^&]+)/)?.[1];
        if (!rawQuery) return fallback;

        const spaced = rawQuery.replace(/\+/g, ' ');
        try {
            return decodeURIComponent(spaced);
        } catch {
            // Malformed percent-encoding: use the raw text instead of failing the request.
            return spaced;
        }
    }

    /**
     * Finds the first visible search input among the known selectors.
     *
     * @throws Error when none is visible; the first inputs on the page are
     *         logged beforehand to help update the selector list.
     */
    private async findSearchInput(page: PlaywrightCrawlingContext['page']) {
        console.log(' 📝 Looking for search input...');

        for (const selector of EtsyScraper.SEARCH_INPUT_SELECTORS) {
            const candidate = await page.$(selector);
            if (candidate && (await candidate.isVisible().catch(() => false))) {
                console.log(` ✅ Found search input: ${selector}`);
                return candidate;
            }
        }

        const inputs = await page.$$eval('input', (els: HTMLInputElement[]) =>
            els.map((el) => ({ id: el.id, name: el.name, type: el.type, placeholder: el.placeholder })),
        );
        console.log(' Available inputs:', JSON.stringify(inputs.slice(0, 10)));
        throw new Error('Search input not found');
    }

    /**
     * Asks the solver whether the page is a challenge page and, if so, tries
     * to solve it and waits for the page to load again.
     *
     * @param blockedMessage Message of the error thrown when solving fails.
     * @param warning Optional line logged when a challenge is detected.
     * @throws Error with `blockedMessage` when the challenge is not solved.
     */
    private async solveChallengeIfBlocked(
        page: PlaywrightCrawlingContext['page'],
        proxyInfo: PlaywrightCrawlingContext['proxyInfo'],
        blockedMessage: string,
        warning?: string,
    ): Promise<void> {
        if (!(await this.dataDomeSolver.isBlocked(page))) return;

        if (warning) console.log(warning);

        const solved = await this.dataDomeSolver.solveDataDome(page, proxyInfo);
        if (!solved) {
            throw new Error(blockedMessage);
        }
        await page.waitForLoadState('domcontentloaded', { timeout: 30000 });
    }

    /**
     * Extracts one product per `[data-palette-listing-id]` card on the page.
     * Cards without a listing link, a listing id or a title are skipped.
     */
    private async extractListingCards(page: PlaywrightCrawlingContext['page']): Promise<EtsyProduct[]> {
        // The callback is serialized and executed inside the page, so it must
        // not reference anything from this module and is kept free of named
        // inner helper functions.
        return page.evaluate(() => {
            const items: EtsyProduct[] = [];

            // Shop names from JSON-LD ItemList blocks, keyed by listing URL and by listing id.
            const shopByKey: Record<string, string> = {};
            document.querySelectorAll('script[type="application/ld+json"]').forEach((script) => {
                try {
                    const data = JSON.parse(script.textContent || '{}');
                    if (data['@type'] === 'ItemList' && Array.isArray(data.itemListElement)) {
                        data.itemListElement.forEach((entry: any) => {
                            const listed = entry.item;
                            if (listed && listed.url && listed.brand && listed.brand.name) {
                                shopByKey[listed.url] = listed.brand.name;

                                const idMatch = listed.url.match(/\/listing\/(\d+)/);
                                if (idMatch) {
                                    shopByKey[idMatch[1]] = listed.brand.name;
                                }
                            }
                        });
                    }
                } catch {
                    // Unparseable or unexpected JSON-LD is ignored; DOM fallbacks cover the shop name.
                }
            });

            document.querySelectorAll('[data-palette-listing-id]').forEach((card) => {
                try {
                    const link = card.querySelector('a[href*="/listing/"]');
                    if (!link) return;

                    const href = link.getAttribute('href') || '';
                    const productId = href.match(/\/listing\/(\d+)/)?.[1] || '';
                    const url = href.startsWith('http') ? href : `https://www.etsy.com${href}`;

                    const titleEl = card.querySelector('h3') ||
                        card.querySelector('[data-listing-card-title]') ||
                        card.querySelector('h2');
                    const title = titleEl?.textContent?.trim() || '';

                    let price = 0;
                    const priceEl = card.querySelector('[data-selector="listing-price"]') ||
                        card.querySelector('[class*="currency-value"]') ||
                        card.querySelector('span[class*="price"]');
                    // The match must start with a digit so a stray comma cannot produce NaN.
                    const priceMatch = priceEl?.textContent?.match(/\d[\d,]*(?:\.\d+)?/);
                    if (priceMatch) price = parseFloat(priceMatch[0].replace(/,/g, ''));

                    const reviewEl = card.querySelector('[aria-label*="star"]') ||
                        card.querySelector('[data-reviews-count]');

                    // Only a number followed by "review(s)" is a count; a number
                    // followed by "star(s)" is the rating and is handled below.
                    let reviewCount = 0;
                    if (reviewEl) {
                        const reviewText = reviewEl.getAttribute('aria-label') || reviewEl.textContent || '';
                        const countMatch = reviewText.match(/(\d[\d,]*)\s*reviews?/i);
                        if (countMatch) reviewCount = parseInt(countMatch[1].replace(/,/g, ''), 10);
                    }

                    let rating = 0;

                    // 1) explicit data-rating attribute
                    const ratingAttr = card.querySelector('[data-rating]')?.getAttribute('data-rating');
                    if (ratingAttr) {
                        const parsedRating = parseFloat(ratingAttr);
                        if (Number.isFinite(parsedRating)) rating = parsedRating;
                    }

                    // 2) "4.8 out of 5" / "4.8 stars" in the aria-label
                    if (rating === 0 && reviewEl) {
                        const ariaLabel = reviewEl.getAttribute('aria-label') || '';
                        const labelMatch = ariaLabel.match(/(\d+(?:\.\d+)?)\s*out of 5/i) ||
                            ariaLabel.match(/(\d+(?:\.\d+)?)\s*stars?/i);
                        if (labelMatch) rating = parseFloat(labelMatch[1]);
                    }

                    // 3) visible text shaped like "4.8 (123)"
                    if (rating === 0 && reviewCount > 0) {
                        for (const el of card.querySelectorAll('span, p, div')) {
                            const textMatch = (el.textContent || '').match(/([\d.]+)\s*\(\s*\d+\s*\)/);
                            if (!textMatch) continue;
                            const candidate = parseFloat(textMatch[1]);
                            if (candidate >= 0 && candidate <= 5) {
                                rating = candidate;
                                break;
                            }
                        }
                    }

                    // Shop name: JSON-LD first (by id, then by URL), then DOM fallbacks.
                    let shopName = (productId && shopByKey[productId]) || shopByKey[url] || '';
                    let shopUrl = '';

                    if (!shopName) {
                        const shopLink = card.querySelector('a[href*="/shop/"]');
                        if (shopLink) {
                            shopName = shopLink.textContent?.trim() || '';
                            const shopHref = shopLink.getAttribute('href') || '';
                            if (shopHref) {
                                shopUrl = shopHref.startsWith('http') ? shopHref : `https://www.etsy.com${shopHref}`;
                            }
                        }
                    }

                    if (!shopName) {
                        const hiddenSpans = card.querySelectorAll(
                            'span[class*="screen-reader-only"], span.wt-screen-reader-only',
                        );
                        for (const span of hiddenSpans) {
                            const fromShop = (span.textContent || '').match(/From shop\s+(.+)/i);
                            if (fromShop) {
                                shopName = fromShop[1].trim();
                                break;
                            }
                        }
                    }

                    if (!shopName) {
                        const shopNameEl = card.querySelector('[data-shop-name]');
                        if (shopNameEl) {
                            shopName = shopNameEl.getAttribute('data-shop-name') ||
                                shopNameEl.textContent?.trim() || '';
                        }
                    }

                    // Last resort: derive the shop URL from the name with whitespace removed.
                    if (shopName && !shopUrl) {
                        shopUrl = `https://www.etsy.com/shop/${shopName.replace(/\s+/g, '')}`;
                    }

                    let imageUrl = '';
                    const imgEl = card.querySelector('img');
                    if (imgEl) {
                        imageUrl = imgEl.getAttribute('src') ||
                            imgEl.getAttribute('data-src') ||
                            imgEl.getAttribute('data-lazy-src') || '';
                    }

                    if (productId && title) {
                        items.push({
                            productId,
                            title,
                            url,
                            price,
                            rating,
                            reviewCount,
                            shopName: shopName || 'N/A',
                            shopUrl: shopUrl || '',
                            imageUrl,
                            scrapedAt: new Date().toISOString(),
                        } as EtsyProduct);
                    }
                } catch (e) {
                    // One malformed card must not abort the whole page.
                    console.error('Error extracting product:', e);
                }
            });

            return items;
        });
    }

    /**
     * Pushes the products that pass the input filters until `input.maxItems`
     * products have been saved in total. Filtered-out products do not count
     * towards the limit, and a missing limit means "no limit".
     */
    private async saveProducts(products: EtsyProduct[]): Promise<void> {
        const limit = this.input.maxItems ?? Number.POSITIVE_INFINITY;

        for (const product of products) {
            if (this.itemCount >= limit) break;

            if (this.passesFilters(product)) {
                console.log(` ✅ "${product.title.substring(0, 50)}..." | $${product.price} | ⭐${product.rating}`);
                await this.pushProduct(product);
                this.itemCount++;
            } else {
                console.log(` ⏭️ Filtered out: ${product.title.substring(0, 30)}`);
            }
        }
    }

    /**
     * Builds the list of start URLs from the input. A query or search URL is
     * represented by the homepage, because the search itself is typed in by
     * the SEARCH handler. With no usable input the homepage is returned alone.
     */
    private generateStartUrls(): string[] {
        const urls: string[] = [];

        if (this.input.query || this.input.searchUrl) {
            urls.push('https://www.etsy.com');
        }

        if (this.input.categoryUrl) urls.push(this.input.categoryUrl);
        if (this.input.shopUrl) urls.push(this.input.shopUrl);
        if (this.input.productUrls) urls.push(...this.input.productUrls);

        if (urls.length === 0) {
            urls.push('https://www.etsy.com');
        }

        return urls;
    }

    /**
     * Opens the homepage and performs scrolling, mouse movement and reading
     * pauses on it. Failures are logged and swallowed.
     * Not called from anywhere in this class at the moment.
     */
    private async warmupSession(page: PlaywrightCrawlingContext['page']): Promise<void> {
        console.log('🔥 Warming up session (anti-DataDome strategy)...');

        try {
            await page.goto('https://www.etsy.com', {
                waitUntil: 'domcontentloaded',
                timeout: 30000,
            });

            const humanBehavior = new HumanBehavior(page);
            await humanBehavior.initialize();

            await this.naturalDelay(2000, 3000);

            await humanBehavior.naturalScroll(3);
            await humanBehavior.randomMouseMovements(4);
            await humanBehavior.readPageContent(4000);

            console.log(' ✅ Session warmed up successfully');
        } catch (error: unknown) {
            const message = error instanceof Error ? error.message : String(error);
            console.log(` ⚠️ Session warmup failed: ${message}`);
        }
    }

    /**
     * Sleeps for a normally distributed duration centred between `minMs` and
     * `maxMs` (standard deviation = range / 6) and clamped to that range.
     */
    private async naturalDelay(minMs: number, maxMs: number): Promise<void> {
        const mean = (minMs + maxMs) / 2;
        const stdDev = (maxMs - minMs) / 6;

        // Box–Muller transform. `1 - Math.random()` lies in (0, 1], which keeps
        // Math.log away from 0 and the resulting delay away from NaN.
        const u1 = 1 - Math.random();
        const u2 = Math.random();
        const z0 = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);

        const delay = Math.max(minMs, Math.min(maxMs, mean + z0 * stdDev));

        await new Promise((resolve) => setTimeout(resolve, delay));
    }

    /**
     * Returns true when the product satisfies every configured filter.
     * A filter that is unset or 0 is treated as "not configured".
     */
    private passesFilters(product: EtsyProduct): boolean {
        const { minRating, minReviews, priceMin, priceMax } = this.input;

        if (minRating && product.rating < minRating) return false;
        if (minReviews && product.reviewCount < minReviews) return false;
        if (priceMin && product.price < priceMin) return false;
        if (priceMax && product.price > priceMax) return false;
        return true;
    }

    /**
     * Pushes a product to the dataset under the `product-scraped` event name.
     * When that call fails with a message containing "price", the product is
     * pushed again without an event name; any other error is rethrown.
     * NOTE(review): presumably the "price" failure means the event is not
     * configured for charging — confirm against the Apify SDK.
     */
    private async pushProduct(product: EtsyProduct): Promise<void> {
        try {
            await Actor.pushData(product, 'product-scraped');
        } catch (error: unknown) {
            const message = (error as { message?: unknown } | null)?.message;
            if (typeof message === 'string' && message.includes('price')) {
                await Actor.pushData(product);
            } else {
                throw error;
            }
        }
    }
}
693
// Actor entry point: reads and validates the input, then runs the scraper.
// Validation failures are logged and rethrown so the run ends as failed.
Actor.main(async () => {
    console.log('🚀 Starting Etsy Scraper Pro...\n');

    const rawInput = await Actor.getInput();
    let input: ValidatedInput;

    try {
        // A missing input is validated as an empty object.
        input = InputSchema.parse(rawInput ?? {});
    } catch (error: unknown) {
        // Anything can be thrown; only Error instances carry a usable message.
        const message = error instanceof Error ? error.message : String(error);
        console.error('❌ Invalid input:', message);
        throw new Error(`Invalid input: ${message}`);
    }

    const scraper = new EtsyScraper(input);
    await scraper.run();

    console.log('\n🎉 Actor finished successfully!');
});