1import { PlaywrightCrawler, Dataset, createPlaywrightRouter } from '@crawlee/playwright';
2import { Actor } from 'apify';
3
// Start the Actor runtime before reading input or touching storage.
await Actor.init();

// A missing input is treated as an empty object so the defaults below apply.
const actorInput = (await Actor.getInput()) ?? {};
const {
  searchQuery,
  maxResults = 1,
  targetLanguage = 'en',
  lingoApiKey,
} = actorInput;

if (!searchQuery) {
  throw new Error('searchQuery is required');
}

// Translation engine. Stays null unless a non-English target language was
// requested AND an API key was supplied; the SDK is only loaded in that case.
let lingoDotDev = null;
if (lingoApiKey && targetLanguage !== 'en') {
  const sdk = await import('lingo.dev/sdk');
  lingoDotDev = new sdk.LingoDotDevEngine({ apiKey: lingoApiKey });
}
16
17
/**
 * Translate English text into the requested language using the module-level
 * Lingo.dev engine (`lingoDotDev`).
 *
 * The input is returned unchanged when it is empty, when the target language
 * is English, when no engine has been configured, or when the translation
 * call fails or yields nothing usable.
 *
 * @param {string} text - English source text.
 * @param {string} targetLang - Target locale code, e.g. 'fr'.
 * @returns {Promise<string>} The translated text, or `text` as a fallback.
 */
async function translateText(text, targetLang) {
  if (!text || targetLang === 'en' || !lingoDotDev) return text;

  try {
    // localizeText expects the plain string to translate as its first argument.
    const result = await lingoDotDev.localizeText(text, {
      sourceLocale: 'en',
      targetLocale: targetLang,
    });

    // Accept either a plain string or a `{ text }` shaped result.
    const translated = typeof result === 'string' ? result : result?.text;
    return typeof translated === 'string' && translated !== '' ? translated : text;
  } catch (error) {
    // Translation is best effort: report the problem and keep the original text.
    console.error('Translation error:', error?.message ?? error);
    return text;
  }
}
32
33
// Retailer and brand domains whose search hits are ranked ahead of all other
// results. Every entry is a complete domain: a bare brand fragment would also
// match unrelated URLs that merely contain that fragment.
const TRUSTED_DOMAINS = [
  'amazon.com', 'sephora.com', 'ulta.com', 'target.com', 'walmart.com',
  'dermstore.com', 'skinstore.com', 'cultbeauty.com', 'lookfantastic.com',
  'yesstyle.com', 'sokoglam.com', 'beautylish.com', 'nordstrom.com',
  'macys.com', 'bloomingdales.com', 'bergdorfgoodman.com', 'saksfifthavenue.com',
  'thefaceshop.com', 'innisfree.com', 'etudehouse.com', 'laneige.com',
  'cosrx.com', 'paulaschoice.com',
];
42
// Product page URLs collected by the GOOGLE handler; consumed by the second
// crawl pass that scrapes each product page.
const productUrls = [];
const router = createPlaywrightRouter();

// Hosts that are never product pages (search, video, social, wikis) plus the
// host fragments 'forum.' and 'community.'.
const BLOCKED_DOMAINS = [
  'google.com', 'youtube.com', 'youtu.be', 'reddit.com', 'quora.com',
  'instagram.com', 'tiktok.com', 'facebook.com', 'twitter.com', 'x.com',
  'pinterest.com', 'linkedin.com', 'tumblr.com', 'snapchat.com',
  'wikipedia.org', 'wikihow.com', 'forum.', 'community.',
];

/** Return the lower-cased hostname of `url`, or '' when it cannot be parsed. */
function getHostname(url) {
  try {
    return new URL(url).hostname.toLowerCase();
  } catch {
    return '';
  }
}

/**
 * Decide whether `hostname` belongs to `pattern`.
 *
 * A complete domain matches the host itself and any of its subdomains, so
 * 'x.com' matches 'x.com' and 'www.x.com' but not 'netflix.com'. A pattern
 * that is not a complete domain (it ends with '.' or contains no dot) is
 * matched as a fragment of the hostname.
 */
function hostMatches(hostname, pattern) {
  const needle = pattern.toLowerCase();
  if (hostname === needle || hostname.endsWith(`.${needle}`)) return true;

  const isFragment = needle.endsWith('.') || !needle.includes('.');
  return isFragment && hostname.includes(needle);
}

/**
 * GOOGLE route: collect outbound result links from a search results page,
 * drop blocked hosts, rank trusted retailers first and queue up to
 * `maxResults` URLs in `productUrls`.
 */
router.addHandler('GOOGLE', async ({ page, log }) => {
  log.info('Parsing Google search results...');

  // Best effort: if the results container never shows up, parse whatever loaded.
  await page.waitForSelector('div#search', { timeout: 10000 }).catch(() => {
    log.warning('Search results container did not appear within 10s; parsing the page as-is');
  });

  // Only the raw hrefs are read inside the browser; all filtering runs here so
  // it can work on the parsed hostname instead of the whole URL string.
  const hrefs = await page.$$eval('div#search a[href^="http"]', (anchors) => anchors.map((a) => a.href));

  const candidates = [...new Set(hrefs)]
    .map((url) => ({ url, host: getHostname(url) }))
    .filter(({ host }) => host !== '' && !BLOCKED_DOMAINS.some((domain) => hostMatches(host, domain)));
  log.info(`Found ${candidates.length} URLs`);

  // Single pass split that keeps the original result order inside each group.
  const trusted = [];
  const others = [];
  for (const { url, host } of candidates) {
    const isTrusted = TRUSTED_DOMAINS.some((domain) => hostMatches(host, domain));
    (isTrusted ? trusted : others).push(url);
  }

  const prioritized = [...trusted, ...others];
  log.info(`Prioritized: ${trusted.length} trusted, ${others.length} others`);

  productUrls.push(...prioritized.slice(0, maxResults));
});
80
// Keyword lists looked up in the lower-cased page text.
const SKIN_TYPE_KEYWORDS = ['oily', 'dry', 'combination', 'sensitive', 'normal', 'all skin types'];
const BENEFIT_KEYWORDS = [
  'hydrating', 'moisturizing', 'brightening', 'anti-aging', 'soothing',
  'calming', 'firming', 'nourishing', 'refreshing',
];

// Selectors tried in order when looking for usage instructions.
const HOW_TO_USE_SELECTORS = [
  '[class*="how-to-use"]', '[class*="howToUse"]', '[class*="usage"]',
  '[class*="directions"]', '[id*="how-to-use"]', '[id*="usage"]', '[id*="directions"]',
];

// A dollar amount such as $5, $19.99, $1,299.00 or $.99. Trailing punctuation
// ("$19.99.") is not captured and a lone "$." does not match.
const PRICE_PATTERN = /\$(?:\d+(?:,\d{3})*(?:\.\d{1,2})?|\.\d{1,2})/;

/**
 * Return the keywords that occur in `text` as whole words or phrases, so that
 * 'normal' is not reported for "normally" and 'dry' is not reported for "laundry".
 */
function findKeywords(text, keywords) {
  return keywords.filter((keyword) => {
    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp(`\\b${escaped}\\b`).test(text);
  });
}

/** Return the first dollar amount found in `text`, or '' when there is none. */
function extractPrice(text) {
  return text.match(PRICE_PATTERN)?.[0] ?? '';
}

/**
 * Return the text of the first usage-instructions element whose length is
 * between 21 and 999 characters, capped at 500 characters; '' when none fits.
 */
async function findHowToUse(page) {
  for (const selector of HOW_TO_USE_SELECTORS) {
    const el = await page.$(selector).catch(() => null);
    if (!el) continue;

    const text = await el.innerText().catch(() => '');
    if (text.length > 20 && text.length < 1000) {
      return text.substring(0, 500);
    }
  }
  return '';
}

/**
 * PRODUCT route: scrape title, description, ingredients, usage instructions,
 * skin types, benefits and price from a product page, optionally translate
 * the description and usage text, and push one record to the dataset.
 */
router.addHandler('PRODUCT', async ({ request, page, log }) => {
  log.info(`Scraping: ${request.url}`);

  await page.waitForLoadState('domcontentloaded');

  const title = await page.title();
  // A missing meta description or unreadable body simply yields an empty string.
  const description = await page.$eval('meta[name="description"]', (el) => el.content).catch(() => '');
  const bodyText = await page.$eval('body', (el) => el.innerText.toLowerCase()).catch(() => '');

  const ingredientEl = await page.$('[class*="ingredient"], [id*="ingredient"]').catch(() => null);
  const ingredients = ingredientEl
    ? (await ingredientEl.innerText().catch(() => '')).substring(0, 800)
    : '';

  const howToUse = await findHowToUse(page);
  const skinTypes = findKeywords(bodyText, SKIN_TYPE_KEYWORDS);
  const benefits = findKeywords(bodyText, BENEFIT_KEYWORDS);
  const price = extractPrice(bodyText);

  let translatedDescription = description;
  let translatedHowToUse = howToUse;

  if (targetLanguage !== 'en' && lingoDotDev) {
    log.info(`Translating to ${targetLanguage}...`);
    const [descriptionResult, howToUseResult] = await Promise.all([
      translateText(description, targetLanguage),
      translateText(howToUse, targetLanguage),
    ]);
    // substring() below needs a string; keep the original text if the
    // translator hands back anything else.
    if (typeof descriptionResult === 'string') translatedDescription = descriptionResult;
    if (typeof howToUseResult === 'string') translatedHowToUse = howToUseResult;
  }

  await Dataset.pushData({
    url: request.url,
    title: title.substring(0, 200),
    description: translatedDescription.substring(0, 500),
    descriptionOriginal: description.substring(0, 500),
    ingredients,
    howToUse: translatedHowToUse,
    howToUseOriginal: howToUse,
    skinTypes,
    benefits,
    price,
    targetLanguage,
    scrapedAt: new Date().toISOString(),
  });

  log.info(`Done: ${title.substring(0, 40)}...`);
});
152
// Proxy setup is best effort: on failure the reason is reported and the crawl
// continues without a proxy. `undefined` (rather than null) is used so the
// crawler option is simply treated as absent.
const proxyConfiguration = await Actor.createProxyConfiguration().catch((error) => {
  console.warn('Proxy configuration unavailable, crawling without a proxy:', error?.message ?? error);
  return undefined;
});

const crawler = new PlaywrightCrawler({
  proxyConfiguration,
  requestHandlerTimeoutSecs: 30,
  navigationTimeoutSecs: 15,
  maxRequestRetries: 1,
  requestHandler: router,
  headless: true,
});

// Pass 1: run the search so the GOOGLE handler can fill `productUrls`.
const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(`${searchQuery} skincare`)}`;
await crawler.run([{ url: searchUrl, label: 'GOOGLE' }]);

// Pass 2: scrape every collected product page.
if (productUrls.length > 0) {
  await crawler.run(productUrls.map((url) => ({ url, label: 'PRODUCT' })));
} else {
  console.warn('No product URLs were collected from the search results; nothing to scrape');
}

// Publish everything stored in the default dataset as a single OUTPUT record.
const dataset = await Dataset.open();
const { items } = await dataset.getData();

await Actor.setValue('OUTPUT', {
  searchQuery,
  targetLanguage,
  totalResults: items.length,
  products: items,
});

await Actor.exit();