1
2
3
4import { log } from 'crawlee';
5
6
7
8
/**
 * Applies the standard browser setup to a page before navigation: user agent,
 * viewport, request interception that drops heavy static assets, and
 * French-first HTTP headers.
 *
 * Setup is best-effort: any failure is logged as a warning and not rethrown.
 *
 * @param {object} page - Puppeteer-style page exposing `setUserAgent`,
 *   `setViewport`, `setRequestInterception`, `on` and `setExtraHTTPHeaders`.
 * @param {string} userAgent - User-agent string to present.
 * @returns {Promise<void>}
 */
export async function preparePage(page, userAgent) {
  // Images, fonts and media are dropped to speed up page loads.
  const blockedResourceTypes = new Set(['image', 'font', 'media']);

  // Resolves one intercepted request. Kept async so that a failing
  // abort()/continue() surfaces as a rejection the caller can catch.
  const resolveRequest = async (req) => {
    // Another listener may already have resolved this request; resolving it
    // a second time throws. The optional call tolerates request objects
    // that do not expose this method.
    if (req.isInterceptResolutionHandled?.()) {
      return;
    }

    if (blockedResourceTypes.has(req.resourceType())) {
      await req.abort();
    } else {
      await req.continue();
    }
  };

  try {
    await page.setUserAgent(userAgent);
    await page.setViewport({ width: 1366, height: 768 });

    await page.setRequestInterception(true);
    page.on('request', (req) => {
      // Never leave the promise floating: an unhandled rejection here would
      // otherwise escape the surrounding try/catch entirely.
      resolveRequest(req).catch((error) => {
        log.debug(`Request interception failed: ${error.message}`);
      });
    });

    await page.setExtraHTTPHeaders({
      'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    });
  } catch (error) {
    log.warning(`Failed to prepare page: ${error.message}`);
  }
}
35
36
37
38
/**
 * Navigates the page to `url`, retrying with a linearly growing delay.
 *
 * An attempt succeeds when `page.goto` resolves with a response whose status
 * is below 400. Both thrown navigation errors and HTTP error responses are
 * retried, and the backoff (`retryDelayMs * attempt`) is applied between
 * attempts in both cases so a struggling server is not hit back-to-back.
 * No delay is added after the final attempt.
 *
 * @param {object} page - Puppeteer-style page exposing `goto`.
 * @param {string} url - Address to load.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @param {number} [retryDelayMs=2000] - Base delay between attempts, in ms.
 * @returns {Promise<boolean>} `true` on a successful load, otherwise `false`.
 */
export async function navigateToUrl(page, url, maxRetries = 3, retryDelayMs = 2000) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      log.info(`Navigation attempt ${attempt}/${maxRetries} for ${url}`);

      const response = await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 30000,
      });

      if (response && response.status() < 400) {
        return true;
      }

      log.warning(`HTTP ${response?.status()} for ${url}, attempt ${attempt}`);
    } catch (error) {
      log.warning(`Navigation attempt ${attempt} failed for ${url}: ${error.message}`);
    }

    // Back off before the next attempt regardless of how this one failed;
    // skip the wait when there is no next attempt.
    if (attempt < maxRetries) {
      await new Promise((resolve) => setTimeout(resolve, retryDelayMs * attempt));
    }
  }

  return false;
}
69
70
71
72
/**
 * Classifies a URL from platform-specific substrings, without any network access.
 *
 * @param {string} url - Address to classify.
 * @returns {'shopify'|'wordpress'|null} Matched platform, or `null`.
 */
function detectPlatformFromUrl(url) {
  const urlLower = url.toLowerCase();
  const shopifyMarkers = ['myshopify.com', 'shopifycdn.com', 'shopifycs.com'];
  const wordpressMarkers = ['wordpress.com', 'wp-content', 'wp-includes'];

  if (shopifyMarkers.some((marker) => urlLower.includes(marker))) {
    return 'shopify';
  }
  if (wordpressMarkers.some((marker) => urlLower.includes(marker))) {
    return 'wordpress';
  }
  return null;
}

/**
 * Classifies a response from platform-specific HTTP headers.
 *
 * Only headers that name the platform are used. The generic
 * `server: nginx/1.14.2` value is deliberately NOT treated as Shopify:
 * it is a plain nginx version string and says nothing about the platform.
 *
 * @param {Object<string, string>} headers - Response headers keyed by the
 *   lower-case names looked up below.
 * @returns {'shopify'|'wordpress'|null} Matched platform, or `null`.
 */
function detectPlatformFromHeaders(headers) {
  if (headers['x-shopify-stage'] ||
      headers['x-shopify-shop-api-call-limit'] ||
      headers['x-shopify-request-id']) {
    return 'shopify';
  }
  if (headers['x-pingback'] || headers['link']?.includes('wp-json')) {
    return 'wordpress';
  }
  return null;
}

/**
 * Runs INSIDE the page (passed to `page.evaluate`), so it must stay
 * self-contained: it may only use its argument and browser globals.
 *
 * Each platform tracks its own detection method so that a WordPress hit can
 * never overwrite the method reported for a Shopify detection. Within one
 * platform, a later check (meta generator, then stylesheet links) replaces
 * the method of an earlier one.
 *
 * @param {{shopifyIndicators: string[], wordpressIndicators: string[]}} indicators
 * @returns {{shopify: boolean, wordpress: boolean, method: string}}
 */
function scanDocumentForPlatforms({ shopifyIndicators, wordpressIndicators }) {
  let shopifyMethod = null;
  let wordpressMethod = null;

  if (typeof window.Shopify !== 'undefined' ||
      typeof window.ShopifyAnalytics !== 'undefined') {
    shopifyMethod = 'JavaScript objects';
  }

  const htmlContent = document.documentElement.outerHTML.toLowerCase();
  const firstMatch = (indicators) =>
    indicators.find((indicator) => htmlContent.includes(indicator.toLowerCase()));

  const shopifyHit = firstMatch(shopifyIndicators);
  if (shopifyHit !== undefined && shopifyMethod === null) {
    shopifyMethod = `DOM (${shopifyHit})`;
  }

  // WordPress markup markers are only consulted when nothing points to Shopify.
  if (shopifyMethod === null) {
    const wordpressHit = firstMatch(wordpressIndicators);
    if (wordpressHit !== undefined) {
      wordpressMethod = `DOM (${wordpressHit})`;
    }
  }

  for (const gen of document.querySelectorAll('meta[name="generator"]')) {
    const content = gen.getAttribute('content')?.toLowerCase() || '';
    if (content.includes('shopify')) {
      shopifyMethod = 'Meta generator';
    } else if (content.includes('wordpress')) {
      wordpressMethod = 'Meta generator';
    }
  }

  for (const link of document.querySelectorAll('link[rel="stylesheet"]')) {
    const href = link.getAttribute('href')?.toLowerCase() || '';
    if (href.includes('shopifycdn.com') || href.includes('assets/theme.css')) {
      shopifyMethod = 'CSS links';
    } else if (href.includes('wp-content') || href.includes('wp-includes')) {
      wordpressMethod = 'CSS links';
    }
  }

  return {
    shopify: shopifyMethod !== null,
    wordpress: wordpressMethod !== null,
    // Shopify takes priority, matching the order in which the caller checks.
    method: shopifyMethod ?? wordpressMethod ?? 'DOM',
  };
}

/**
 * Detects whether a site runs on Shopify or WordPress.
 *
 * Stages run from cheapest to most expensive and stop at the first hit:
 * URL substrings, HTTP response headers, in-page DOM/JavaScript inspection,
 * observed network requests, then well-known API endpoints. Errors are
 * logged and yield whatever has been determined so far (by default: neither).
 *
 * @param {object} page - Puppeteer-style page exposing `goto` and `evaluate`.
 * @param {string} url - Site address to analyse.
 * @param {object} [config] - Not read by this function; kept for interface
 *   compatibility.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean, detectionMethod: (string|null)}>}
 */
export async function detectTechnologies(page, url, config) {
  const result = {
    isShopify: false,
    isWordPress: false,
    detectionMethod: null,
  };

  // Records the verdict on `result` and hands it back for an early return.
  const conclude = (platform, method) => {
    result.isShopify = platform === 'shopify';
    result.isWordPress = platform === 'wordpress';
    result.detectionMethod = method;
    return result;
  };

  // 'theme-color' is intentionally absent: it is a generic meta tag, not a
  // Shopify marker, and would flag unrelated sites as Shopify.
  const shopifyIndicators = [
    'Shopify',
    'Shopify.theme',
    'Shopify.shop',
    'Shopify.currency',
    'ShopifyAnalytics',
    'window.Shopify',
    'shopify-digital-wallet',
    'shopify-checkout-api-token',
    'shopify-section',
    'shopify-policy-list',
    'shopify-product-form',
    'shopifycdn.com',
    'assets/theme.js',
    'assets/option_selection.js',
    '/wpm@',
    'monorail-edge.shopifysvc.com',
  ];

  const wordpressIndicators = [
    'wp-content',
    'wp-includes',
    'wp-admin',
    'wordpress',
    '/wp-json/',
    'wp-embed',
    'wlwmanifest',
    'xmlrpc.php',
  ];

  try {
    const urlPlatform = detectPlatformFromUrl(url);
    if (urlPlatform) {
      return conclude(urlPlatform, 'URL');
    }

    const response = await page.goto(url, { waitUntil: 'networkidle2' });
    const headerPlatform = detectPlatformFromHeaders(response?.headers() || {});
    if (headerPlatform) {
      return conclude(headerPlatform, 'HTTP Headers');
    }

    // Give client-side scripts time to run. `page.waitForTimeout` is not
    // available on every page implementation; calling it blindly would throw
    // and silently skip every remaining stage, so fall back to a plain timer.
    if (typeof page.waitForTimeout === 'function') {
      await page.waitForTimeout(3000);
    } else {
      await new Promise((resolve) => setTimeout(resolve, 3000));
    }
    // Only present on some page implementations; a no-op otherwise.
    await page.waitForLoadState?.('networkidle');

    const detectionResult = await page.evaluate(scanDocumentForPlatforms, {
      shopifyIndicators,
      wordpressIndicators,
    });

    if (detectionResult.shopify) {
      return conclude('shopify', detectionResult.method);
    }
    if (detectionResult.wordpress) {
      return conclude('wordpress', detectionResult.method);
    }

    log.info(`Checking network requests for ${url}`);
    const networkDetection = await checkNetworkRequests(page, url);
    if (networkDetection.isShopify) {
      return conclude('shopify', 'Network requests');
    }
    if (networkDetection.isWordPress) {
      return conclude('wordpress', 'Network requests');
    }

    const apiDetection = await testApiEndpoints(page, url);
    if (apiDetection.isShopify) {
      return conclude('shopify', 'API endpoints');
    }
    if (apiDetection.isWordPress) {
      return conclude('wordpress', 'API endpoints');
    }
  } catch (error) {
    log.error(`Detection error for ${url}: ${error.message}`);
  }

  return result;
}
279
280
281
282
/**
 * Watches the page's network traffic for platform-specific requests.
 *
 * Since the page is usually already loaded, a scroll and two storefront
 * probes (`/cart.js`, `/products.json`) are triggered to provoke traffic.
 * Those probes are issued by this function itself, so their REQUESTS prove
 * nothing; they only count when the server answers 200 with JSON. Requests
 * to platform asset hosts/paths count as soon as they are seen.
 *
 * Listeners and the timer are always removed before resolving, including
 * on timeout.
 *
 * @param {object} page - Puppeteer-style page exposing `on`, `off`, `evaluate`.
 * @param {string} url - Not read by this function; kept for interface
 *   compatibility.
 * @param {number} [timeoutMs=5000] - How long to observe before giving up.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function checkNetworkRequests(page, url, timeoutMs = 5000) {
  return new Promise((resolve) => {
    const shopifyRequestMarkers = [
      'shopifycdn.com',
      'shopifycs.com',
      'monorail-edge.shopifysvc.com',
      '/cart/add',
    ];
    const wordpressRequestMarkers = ['wp-content', 'wp-includes', 'wp-admin', 'wp-json'];
    // Endpoints probed below; judged on the response, never on the request.
    const shopifyProbeMarkers = ['/products.json', '/cart.js'];

    let settled = false;
    let timer = null;

    // Single exit point: guarantees cleanup and exactly one resolution.
    const finish = (outcome) => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      page.off('request', onRequest);
      page.off('response', onResponse);
      resolve(outcome);
    };

    const onRequest = (request) => {
      const requestUrl = request.url().toLowerCase();

      if (shopifyRequestMarkers.some((marker) => requestUrl.includes(marker))) {
        finish({ isShopify: true, isWordPress: false });
      } else if (wordpressRequestMarkers.some((marker) => requestUrl.includes(marker))) {
        finish({ isShopify: false, isWordPress: true });
      }
    };

    const onResponse = (response) => {
      const responseUrl = response.url().toLowerCase();
      if (!shopifyProbeMarkers.some((marker) => responseUrl.includes(marker))) {
        return;
      }

      const contentType = response.headers()['content-type'] || '';
      if (response.status() === 200 && contentType.includes('application/json')) {
        finish({ isShopify: true, isWordPress: false });
      }
    };

    timer = setTimeout(() => {
      finish({ isShopify: false, isWordPress: false });
    }, timeoutMs);

    page.on('request', onRequest);
    page.on('response', onResponse);

    // Best-effort trigger: a failure here just means we wait for the timeout.
    page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight / 2);

      if (window.fetch) {
        fetch('/cart.js').catch(() => {});
        fetch('/products.json?limit=1').catch(() => {});
      }
    }).catch((error) => {
      log.debug(`Could not trigger network activity: ${error.message}`);
    });
  });
}
330
331
332
333
/**
 * Probes well-known Shopify and WordPress endpoints by navigating to them.
 *
 * Shopify endpoints must answer 200 with a JSON content type and a body
 * mentioning `products`, `handle` or `variants`. WordPress endpoints must
 * answer 200 with a body mentioning `wp:`, `wordpress`, `wp-json` or
 * `methodName`. Shopify is checked first; the first hit wins.
 *
 * Probing navigates the page away from the site, so once any probe has run
 * the page is navigated back to `baseUrl` before returning. Otherwise later
 * DOM inspection on the same page would see a probe endpoint's document.
 * NOTE(review): assumes callers keep using `page` afterwards — confirm.
 *
 * @param {object} page - Puppeteer-style page exposing `goto`.
 * @param {string} baseUrl - Site address the endpoints are resolved against.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function testApiEndpoints(page, baseUrl) {
  const result = { isShopify: false, isWordPress: false };

  const shopifyEndpoints = [
    '/products.json',
    '/cart.js',
    '/collections.json',
    '/api/2023-01/products.json',
  ];

  const wordpressEndpoints = [
    '/wp-json/',
    '/wp-json/wp/v2/',
    '/xmlrpc.php',
    '/?rest_route=/',
  ];

  const looksLikeShopify = async (response) => {
    const contentType = response.headers()['content-type'] || '';
    if (!contentType.includes('application/json')) {
      return false;
    }
    const text = await response.text();
    return ['products', 'handle', 'variants'].some((marker) => text.includes(marker));
  };

  const looksLikeWordPress = async (response) => {
    const text = await response.text();
    return ['wp:', 'wordpress', 'wp-json', 'methodName'].some((marker) => text.includes(marker));
  };

  let navigated = false;

  // Loads one endpoint and applies `matches` to a 200 response. A failing
  // probe is expected (timeouts, aborted loads) and simply counts as "no".
  const probe = async (endpoint, matches) => {
    try {
      const testUrl = new URL(endpoint, baseUrl).href;
      navigated = true;
      const response = await page.goto(testUrl, {
        waitUntil: 'networkidle2',
        timeout: 10000,
      });

      if (!response || response.status() !== 200) {
        return false;
      }
      return await matches(response);
    } catch (error) {
      log.debug(`API probe ${endpoint} failed: ${error.message}`);
      return false;
    }
  };

  try {
    for (const endpoint of shopifyEndpoints) {
      if (await probe(endpoint, looksLikeShopify)) {
        result.isShopify = true;
        return result;
      }
    }

    for (const endpoint of wordpressEndpoints) {
      if (await probe(endpoint, looksLikeWordPress)) {
        result.isWordPress = true;
        return result;
      }
    }
  } finally {
    // Runs on every exit path, including the early returns above.
    if (navigated) {
      try {
        await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 });
      } catch (error) {
        log.warning(`API endpoint testing could not restore ${baseUrl}: ${error.message}`);
      }
    }
  }

  return result;
}
409
410
411
412
/**
 * Reports which email-marketing tools leave a trace in the page's markup.
 *
 * A tool is detected when any of its indicators occurs, case-insensitively,
 * in the document's serialized HTML. On failure of the in-page scan a
 * warning is logged and every tool is reported as `false`.
 *
 * The configuration is normalized first so that one malformed entry cannot
 * break detection for all tools: a bare string counts as a single indicator,
 * non-string and empty indicators are ignored (an empty string would match
 * every page), and any other value means "no indicators".
 *
 * @param {object} page - Puppeteer-style page exposing `evaluate`.
 * @param {Object<string, string[]|string>} emailTools - Tool name → indicators.
 *   `null`/`undefined` is treated as an empty configuration.
 * @returns {Promise<Object<string, boolean>>} Tool name → detected flag.
 */
export async function detectEmailTools(page, emailTools) {
  const toolsConfig = {};
  const results = {};

  for (const [tool, indicators] of Object.entries(emailTools ?? {})) {
    let candidates = [];
    if (typeof indicators === 'string') {
      candidates = [indicators];
    } else if (Array.isArray(indicators)) {
      candidates = indicators;
    }

    toolsConfig[tool] = candidates.filter(
      (indicator) => typeof indicator === 'string' && indicator !== '',
    );
    results[tool] = false;
  }

  try {
    // Runs inside the page: only the argument and browser globals are available.
    const detection = await page.evaluate((config) => {
      const htmlContent = document.documentElement.outerHTML.toLowerCase();
      const detected = {};

      for (const [toolName, indicators] of Object.entries(config)) {
        detected[toolName] = indicators.some(
          (indicator) => htmlContent.includes(indicator.toLowerCase()),
        );
      }

      return detected;
    }, toolsConfig);

    Object.assign(results, detection);
  } catch (error) {
    log.warning(`Email tools detection failed: ${error.message}`);
  }

  return results;
}