Recherche CMS avatar
Recherche CMS

Under maintenance

Pricing

Pay per usage

Go to Store
Recherche CMS

Recherche CMS

Under maintenance

Developed by

TML

TML

Maintained by Community

0.0 (0)

Pricing

Pay per usage

1

Total users

4

Monthly users

2

Runs succeeded

6.7%

Last modified

a day ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# Note: this base image ships Node 20 with Chromium preinstalled and runs
# as the non-root user "myuser" with the working directory already set.
FROM apify/actor-node-puppeteer-chrome:20
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
# The final `rm -r ~/.npm` purges the npm download cache from the layer.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor-1",
"title": "Project Puppeteer Crawler JavaScript",
"description": "Crawlee and Puppeteer project in JavaScript.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-puppeteer-chrome"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Analyse CMS et Marketing Email - Version Améliorée",
"type": "object",
"schemaVersion": 1,
"description": "Analysez les sites web pour détecter leur CMS (Shopify/WordPress) et leurs outils d'email marketing avec une détection Shopify améliorée",
"properties": {
"keywords": {
"title": "Termes de recherche",
"type": "array",
"description": "Entrez un ou plusieurs mots-clés à rechercher sur Google (un par ligne)",
"editor": "stringList",
"default": ["vêtements", "mode"],
"sectionCaption": "Paramètres de recherche",
"sectionDescription": "Configurez votre recherche Google"
},
"maxPages": {
"title": "Nombre de pages Google par mot-clé",
"type": "integer",
"description": "Nombre de pages de résultats Google à analyser pour chaque mot-clé",
"default": 3,
"minimum": 1,
"maximum": 10,
"unit": "page(s)"
},
"maxUrlsToAnalyze": {
"title": "Nombre de sites à analyser par mot-clé",
"type": "integer",
"description": "Combien de sites souhaitez-vous scanner pour chaque mot-clé",
"default": 30,
"minimum": 5,
"maximum": 100,
"unit": "site(s)"
},
"country": {
"title": "Pays pour les résultats",
"type": "string",
"description": "Choisissez le pays pour lequel vous souhaitez voir les résultats Google",
"editor": "select",
"default": "fr",
"enum": ["fr", "be", "ch", "ca", "ma", "us", "gb"],
"enumTitles": ["France", "Belgique", "Suisse", "Canada", "Maroc", "États-Unis", "Royaume-Uni"]
}
},
"required": ["keywords"]
}

src/config.js

1/**
2 * Enhanced configuration for the crawler
3 */
/**
 * Enhanced configuration for the crawler.
 *
 * Central place for search limits, Puppeteer launch flags, platform
 * detection indicators (Shopify / WordPress), email-marketing-tool
 * signatures, fallback URLs, delays and detection-scoring weights.
 *
 * Fix: the previous version ended with a stray `'` after the closing `};`
 * (a syntax error) and carried extraction line-number artifacts fused into
 * the tokens; both are removed here. The data itself is unchanged.
 */
export const CONFIG = {
    // Search parameters
    DEFAULT_COUNTRY: 'fr',
    MAX_PAGES_PER_KEYWORD: 3,
    MAX_URLS_TO_ANALYZE: 30,

    // Google search settings: ISO country code -> Google domain to query
    GOOGLE_DOMAINS: {
        'fr': 'google.fr',
        'be': 'google.be',
        'ch': 'google.ch',
        'ca': 'google.ca',
        'ma': 'google.co.ma',
        'us': 'google.com',
        'gb': 'google.co.uk'
    },

    // Browser configuration (Puppeteer launch options)
    BROWSER_OPTIONS: {
        headless: true,
        args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
            '--window-size=1366,768',
            '--hide-scrollbars',
            '--disable-notifications',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-renderer-backgrounding',
            '--disable-features=TranslateUI',
            '--disable-ipc-flooding-protection'
        ]
    },

    // Crawler settings
    CRAWLER_OPTIONS: {
        maxConcurrency: 3,
        navigationTimeoutSecs: 45,
        maxRequestRetries: 2
    },

    // User agents for rotation (desktop Chrome/Firefox/Safari)
    USER_AGENTS: [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'
    ],

    // Enhanced Shopify detection indicators, grouped by detection channel
    SHOPIFY_INDICATORS: {
        // URL patterns
        urls: [
            'myshopify.com',
            'shopifycdn.com',
            'shopifycs.com',
            'shopify-analytics.com',
            'monorail-edge.shopifysvc.com'
        ],

        // HTTP headers
        headers: [
            'x-shopify-stage',
            'x-shopify-shop-api-call-limit',
            'x-shopify-request-id',
            'x-shopify-shop-domain',
            'x-shopify-api-version'
        ],

        // JavaScript objects
        javascript: [
            'Shopify',
            'Shopify.theme',
            'Shopify.shop',
            'Shopify.currency',
            'ShopifyAnalytics',
            'Shopify.checkout',
            'Shopify.routes'
        ],

        // DOM elements and classes
        dom: [
            'shopify-section',
            'shopify-policy-list',
            'shopify-product-form',
            'shopify-payment-button',
            'shopify-cleanslate',
            'shopify-product-reviews-badge'
        ],

        // Meta tags
        meta: [
            'shopify-digital-wallet',
            'shopify-checkout-api-token',
            'shopify-checkout-domain'
        ],

        // Script and CSS sources
        assets: [
            'assets/theme.js',
            'assets/option_selection.js',
            'assets/theme.css',
            '/wpm@',
            'cdn.shopify.com',
            'v3.shopifycdn.com'
        ],

        // API endpoints
        endpoints: [
            '/products.json',
            '/cart.js',
            '/collections.json',
            '/cart/add',
            '/cart/update',
            '/api/2023-01/',
            '/api/2024-01/'
        ],

        // Text patterns in HTML
        textPatterns: [
            'powered by shopify',
            'shopify-checkout',
            'shopify-features',
            'shopify-money-format',
            'var shopifycdn'
        ]
    },

    // Enhanced WordPress detection indicators, same channel grouping
    WORDPRESS_INDICATORS: {
        // URL patterns
        urls: [
            'wp-content',
            'wp-includes',
            'wp-admin',
            'wordpress.com',
            'wp-json'
        ],

        // HTTP headers
        headers: [
            'x-pingback',
            'x-powered-by-wordpress'
        ],

        // DOM elements and classes
        dom: [
            'wp-block',
            'wp-container',
            'wp-site-blocks',
            'wp-block-group',
            'wp-block-column',
            'wp-block-paragraph',
            'wp-embed',
            'wp-caption',
            'wp-image'
        ],

        // Meta tags
        meta: [
            'generator',
            'wlwmanifest'
        ],

        // Script and CSS sources
        assets: [
            'wp-content/themes',
            'wp-content/plugins',
            'wp-includes/js',
            'wp-includes/css',
            'wp-emoji-release.min.js'
        ],

        // API endpoints
        endpoints: [
            '/wp-json/',
            '/wp-json/wp/v2/',
            '/xmlrpc.php',
            '/?rest_route=/',
            '/wp-admin/admin-ajax.php'
        ],

        // Text patterns in HTML
        textPatterns: [
            'powered by wordpress',
            'wp-json',
            'wp_enqueue_script',
            'wp_head',
            'wordpress'
        ]
    },

    // Email marketing tools: tool name -> substrings that identify it
    EMAIL_TOOLS: {
        klaviyo: [
            'klaviyo',
            'klaviyo.com',
            'static.klaviyo.com',
            'kla.js'
        ],
        mailchimp: [
            'mailchimp',
            'mc.us',
            'chimpstatic.com',
            'mc4wp',
            'mailchimp-woocommerce'
        ],
        brevo: [
            'brevo',
            'sendinblue',
            'sibforms',
            'mailin.fr',
            'sib-api'
        ],
        hubspot: [
            'hubspot',
            'hs-scripts.com',
            'hsforms.net',
            'hubapi.com'
        ],
        salesforce: [
            'salesforce',
            'pardot',
            'exacttarget',
            'marketingcloud'
        ],
        convertkit: [
            'convertkit',
            'ck.page',
            'convertkit-mail'
        ],
        aweber: [
            'aweber',
            'aweber.com',
            'forms.aweber.com'
        ],
        getresponse: [
            'getresponse',
            'getresponse.com',
            'gr-cdn.com'
        ],
        activecampaign: [
            'activecampaign',
            'activehosted.com',
            'trackcmp.net'
        ],
        constantcontact: [
            'constantcontact',
            'ctctcdn.com',
            'constantcontacts.com'
        ],
        omnisend: [
            'omnisend',
            'omnisrc.com',
            'omnisend.com'
        ],
        privy: [
            'privy',
            'privy.com',
            'widget.privy.com'
        ],
        yotpo: [
            'yotpo',
            'staticw2.yotpo.com',
            'yotpo.com'
        ],
        attentive: [
            'attentive',
            'attentivemobile.com',
            'attn.tv'
        ]
    },

    // Fallback URLs analyzed when the Google search yields nothing
    FALLBACK_URLS: [
        'https://www.shopify.com',
        'https://wordpress.org',
        'https://example.com'
    ],

    // Delays and timeouts (milliseconds)
    DELAYS: {
        betweenRequests: 1000,
        pageLoad: 3000,
        networkIdle: 5000,
        apiTest: 10000
    },

    // Detection weights (for scoring system); higher = stronger signal
    DETECTION_WEIGHTS: {
        url: 10,
        headers: 9,
        javascript: 8,
        meta: 7,
        dom: 6,
        assets: 5,
        endpoints: 8,
        textPatterns: 4
    }
};

src/googleSearch.js

1/**
2 * Google search functionality
3 */
4import { log } from 'crawlee';
5import { PuppeteerCrawler } from 'crawlee';
6import { getRandomUserAgent, normalizeUrl, removeDuplicates, sleep } from './utils.js';
7
/**
 * Search Google for result URLs matching `keyword`.
 *
 * Spins up a single-concurrency PuppeteerCrawler that loads up to
 * `config.MAX_PAGES_PER_KEYWORD` Google results pages on the country-specific
 * Google domain, extracts outbound result links, unwraps Google redirect
 * URLs, normalizes and deduplicates them.
 *
 * Fix: `page.waitForTimeout()` has been removed from current Puppeteer
 * versions; the shared `sleep()` helper (already imported from utils.js)
 * is used instead.
 *
 * @param {string} keyword - Search term.
 * @param {string} country - Country code; must be a key of config.GOOGLE_DOMAINS
 *     (falls back to 'fr').
 * @param {object} config - CONFIG object (domains, limits, browser options, UAs).
 * @returns {Promise<string[]>} Unique, normalized result URLs.
 */
export async function searchGoogleForUrls(keyword, country, config) {
    log.info(`🔍 Starting Google search for: "${keyword}" in ${country}`);

    const googleDomain = config.GOOGLE_DOMAINS[country] || config.GOOGLE_DOMAINS['fr'];
    const maxPages = config.MAX_PAGES_PER_KEYWORD || 3;

    // Collected across all result pages; the crawler handler closes over it.
    let allUrls = [];

    const searchCrawler = new PuppeteerCrawler({
        launchContext: {
            launchOptions: config.BROWSER_OPTIONS,
        },
        maxConcurrency: 1, // Keep it low to avoid being blocked
        navigationTimeoutSecs: 30,
        maxRequestRetries: 2,

        async requestHandler({ page, request }) {
            try {
                const pageNumber = request.userData.pageNumber || 1;
                log.info(`📄 Searching Google page ${pageNumber} for "${keyword}"`);

                // Present a randomized desktop profile.
                const userAgent = getRandomUserAgent(config.USER_AGENTS);
                await page.setUserAgent(userAgent);
                await page.setViewport({ width: 1366, height: 768 });

                // Anti-bot-detection overrides, injected before any page script runs.
                await page.evaluateOnNewDocument(() => {
                    // Hide the automation flag.
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined,
                    });

                    // Fake the window.chrome object a real Chrome exposes.
                    window.chrome = {
                        runtime: {},
                        loadTimes: function() {},
                        csi: function() {},
                        app: {}
                    };

                    // Pretend some plugins are installed.
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5],
                    });

                    // Report French-first language preferences.
                    Object.defineProperty(navigator, 'languages', {
                        get: () => ['fr-FR', 'fr', 'en'],
                    });
                });

                const searchUrl = request.url;
                log.info(`🌐 Navigating to: ${searchUrl}`);

                await page.goto(searchUrl, {
                    waitUntil: 'networkidle2',
                    timeout: 30000
                });

                // Give the results page a moment to settle.
                await sleep(2000);

                // Dismiss the cookie-consent dialog when present.
                try {
                    const acceptButton = await page.$('button[id*="accept"], button[id*="consent"], #L2AGLb');
                    if (acceptButton) {
                        await acceptButton.click();
                        await sleep(1000);
                    }
                } catch (e) {
                    log.info('No cookie consent found or already handled');
                }

                // Extract outbound result links, skipping Google itself plus
                // social/encyclopedia hosts that are never shop candidates.
                const urls = await page.evaluate(() => {
                    const results = [];

                    // Multiple selectors to survive different Google layouts.
                    const selectors = [
                        'a[href^="http"]:not([href*="google."])',
                        'div.g a[href^="http"]',
                        'div[data-ved] a[href^="http"]',
                        '.rc a[href^="http"]',
                        'div.yuRUbf a[href^="http"]'
                    ];

                    selectors.forEach(selector => {
                        const elements = document.querySelectorAll(selector);
                        elements.forEach(element => {
                            const href = element.href;
                            if (href &&
                                !href.includes('google.') &&
                                !href.includes('youtube.com') &&
                                !href.includes('facebook.com') &&
                                !href.includes('instagram.com') &&
                                !href.includes('twitter.com') &&
                                !href.includes('linkedin.com') &&
                                !href.includes('wikipedia.org') &&
                                !href.includes('webcache.googleusercontent.com')) {
                                results.push(href);
                            }
                        });
                    });

                    return results;
                });

                log.info(`Found ${urls.length} URLs on page ${pageNumber}`);
                allUrls.push(...urls);

            } catch (error) {
                log.error(`Error searching Google page: ${error.message}`);
                throw error;
            }
        },

        failedRequestHandler({ request, error }) {
            log.error(`Google search request failed: ${error.message}`);
        }
    });

    // One request per Google results page (10 organic results each).
    const searchUrls = [];
    for (let page = 1; page <= maxPages; page++) {
        const start = (page - 1) * 10;
        const searchUrl = `https://${googleDomain}/search?q=${encodeURIComponent(keyword)}&start=${start}&num=10&hl=fr&gl=${country.toLowerCase()}`;

        searchUrls.push({
            url: searchUrl,
            userData: { pageNumber: page }
        });
    }

    await searchCrawler.addRequests(searchUrls);
    await searchCrawler.run();

    // Unwrap Google redirect links ("/url?q=…"), drop malformed URLs,
    // then normalize.
    let cleanedUrls = allUrls
        .map(url => {
            try {
                if (url.includes('google.') && url.includes('url?q=')) {
                    const urlParams = new URLSearchParams(url.split('?')[1]);
                    return urlParams.get('q') || url;
                }
                return url;
            } catch (e) {
                return url;
            }
        })
        .filter(url => {
            try {
                new URL(url);
                return true;
            } catch (e) {
                return false;
            }
        })
        .map(url => normalizeUrl(url));

    cleanedUrls = removeDuplicates(cleanedUrls);

    log.info(`✅ Google search completed. Found ${cleanedUrls.length} unique URLs for "${keyword}"`);

    if (cleanedUrls.length > 0) {
        log.info(`Sample URLs found: ${cleanedUrls.slice(0, 3).join(', ')}`);
    }

    return cleanedUrls;
}
191
/**
 * Alternative Google search using direct HTTP requests (fallback).
 *
 * Placeholder: a plain-HTTP search is not implemented yet, so this always
 * resolves to an empty list, which makes the caller fall back to
 * CONFIG.FALLBACK_URLS.
 *
 * @param {string} keyword - Search term (only logged for now).
 * @param {string} country - Country code (unused).
 * @param {object} config - CONFIG object (unused).
 * @returns {Promise<string[]>} Always an empty array.
 */
export async function searchGoogleAlternative(keyword, country, config) {
    log.info(`🔄 Using alternative Google search for: "${keyword}"`);

    // TODO: implement an HTTP-based search here instead of browser automation.
    const noResults = [];
    return noResults;
}
204
/**
 * Extract candidate URLs from raw Google search-results HTML.
 *
 * Scans `href="…"` and `data-href="…"` attributes and keeps every absolute
 * http(s) URL that does not belong to Google, major social networks or
 * Wikipedia. Duplicates are NOT removed here.
 *
 * @param {string} html - Raw HTML of a results page.
 * @returns {string[]} Matched URLs in document order per pattern.
 */
function extractUrlsFromHtml(html) {
    const blockedHosts = [
        'google.',
        'youtube.com',
        'facebook.com',
        'instagram.com',
        'twitter.com',
        'linkedin.com',
        'wikipedia.org'
    ];

    // Fresh regex literals per call, so /g lastIndex state never leaks.
    const urlPatterns = [
        /href="(https?:\/\/[^"]+)"/g,
        /data-href="(https?:\/\/[^"]+)"/g
    ];

    const urls = [];
    for (const pattern of urlPatterns) {
        for (const found of html.matchAll(pattern)) {
            const candidate = found[1];
            if (!blockedHosts.some((host) => candidate.includes(host))) {
                urls.push(candidate);
            }
        }
    }

    return urls;
}
237
/**
 * Build a list of search-query variations for a base keyword.
 *
 * Returns the keyword itself, an exact-phrase `.com`-restricted variant,
 * and three commerce-flavored suffixes, to diversify Google results.
 *
 * @param {string} baseKeyword - Keyword to expand.
 * @returns {string[]} Five query strings, base keyword first.
 */
export function generateSearchQueries(baseKeyword) {
    const commerceSuffixes = ['boutique', 'shop', 'store'];

    return [
        baseKeyword,
        `"${baseKeyword}" site:*.com`,
        ...commerceSuffixes.map((suffix) => `${baseKeyword} ${suffix}`)
    ];
}
252
/**
 * Decide whether a URL is an acceptable organic-search candidate.
 *
 * A URL is rejected when it cannot be parsed, or when its hostname contains
 * any excluded domain fragment (Google, big social networks, Wikipedia and
 * major marketplaces — none of which are independent shop sites).
 *
 * @param {string} url - URL to check.
 * @returns {boolean} True when parseable and not on the exclusion list.
 */
function isValidSearchResult(url) {
    const excludedDomains = [
        'google.',
        'youtube.com',
        'facebook.com',
        'instagram.com',
        'twitter.com',
        'linkedin.com',
        'wikipedia.org',
        'amazon.',
        'ebay.',
        'aliexpress.com',
        'alibaba.com'
    ];

    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    } catch (e) {
        return false;
    }

    return excludedDomains.every((domain) => !hostname.includes(domain));
}

src/main.js

/**
 * Apify Actor for analyzing Shopify and WordPress usage from Google search results.
 * Enhanced version with improved Shopify detection.
 *
 * Flow per keyword:
 *   1. Collect candidate URLs from Google (searchGoogleForUrls), falling back
 *      to CONFIG.FALLBACK_URLS when the search yields nothing or fails.
 *   2. Visit each URL with a PuppeteerCrawler and classify it as Shopify,
 *      WordPress or other, plus the email-marketing tools it uses.
 *   3. Push the aggregated per-keyword results to the default dataset.
 */
import { Actor } from 'apify';
import { PuppeteerCrawler, log } from 'crawlee';
// This file lives in src/, so sibling modules are imported as './…'.
// (The previous './src/….js' paths resolved to src/src/… and could not load.)
import { CONFIG } from './config.js';
import {
    getRandomUserAgent,
    initializeResults,
    formatKeywordResults
} from './utils.js';
import { searchGoogleForUrls } from './googleSearch.js';
import {
    detectTechnologies,
    detectEmailTools,
    preparePage,
    navigateToUrl
} from './siteAnalyzer.js';

/**
 * Push `url` into every `results[prefix + CapitalizedTool]` bucket whose
 * email tool was detected (emailInfo maps tool name -> boolean).
 */
function recordEmailTools(results, emailInfo, prefix, url) {
    for (const tool in emailInfo) {
        if (emailInfo[tool]) {
            const capitalizedTool = tool.charAt(0).toUpperCase() + tool.slice(1);
            results[`${prefix}${capitalizedTool}`].push(url);
        }
    }
}

// Main Actor function
await Actor.main(async () => {
    try {
        log.info('Starting the enhanced Shopify/WordPress analyzer');

        // APIFY_IS_AT_HOME is set when the Actor runs ON the Apify platform.
        // Test mode (reduced limits, no proxy) therefore applies when it is
        // NOT set, i.e. for local runs. The previous check was inverted and
        // crippled platform runs while running local tests at full scale.
        const isInTestMode = !process.env.APIFY_IS_AT_HOME;

        // Get and process input parameters
        const input = await Actor.getInput() || {};
        const {
            keywords = ['test'],
            country = CONFIG.DEFAULT_COUNTRY,
            maxPages = isInTestMode ? 1 : CONFIG.MAX_PAGES_PER_KEYWORD,
            maxUrlsToAnalyze = isInTestMode ? 3 : CONFIG.MAX_URLS_TO_ANALYZE
        } = input;

        // Validate input
        if (!keywords || !Array.isArray(keywords) || keywords.length === 0) {
            throw new Error('At least one keyword is required');
        }

        log.info(`Analyzing ${keywords.length} keywords with up to ${maxUrlsToAnalyze} URLs per keyword`);

        // Results container for all keywords
        const allResults = {};

        for (const keyword of keywords) {
            if (!keyword || keyword.trim() === '') {
                log.warning('Empty keyword detected, skipping');
                continue;
            }

            log.info(`\n========== ANALYZING KEYWORD: ${keyword} ==========`);

            // Step 1: Search Google and get URLs
            let urlsToAnalyze = [];
            try {
                const searchResults = await searchGoogleForUrls(keyword, country, CONFIG);

                if (searchResults.length > 0) {
                    urlsToAnalyze = searchResults.slice(0, maxUrlsToAnalyze);
                    log.info(`Found ${urlsToAnalyze.length} URLs to analyze for "${keyword}"`);
                } else {
                    log.warning(`No URLs found for "${keyword}", using fallback URLs`);
                    urlsToAnalyze = [...CONFIG.FALLBACK_URLS];
                }
            } catch (searchError) {
                log.error(`Search failed for "${keyword}": ${searchError.message}`);
                urlsToAnalyze = [...CONFIG.FALLBACK_URLS];
                log.info(`Using ${urlsToAnalyze.length} fallback URLs`);
            }

            if (urlsToAnalyze.length === 0) {
                log.info(`No URLs to analyze for "${keyword}", skipping`);
                allResults[keyword] = {
                    message: "No URLs found for this search",
                    timestamp: new Date().toISOString()
                };
                continue;
            }

            // Per-keyword result buckets (shopify / wordpress / autres plus
            // per-email-tool combination arrays) from utils.initializeResults.
            const results = initializeResults();

            // Residential proxy on the platform; none for local test runs.
            const proxyConfig = isInTestMode ?
                undefined :
                await Actor.createProxyConfiguration({
                    useApifyProxy: true,
                    apifyProxyGroups: ['RESIDENTIAL']
                });

            // Step 2: Set up crawler to analyze each URL
            const crawler = new PuppeteerCrawler({
                proxyConfiguration: proxyConfig,
                launchContext: {
                    launchOptions: CONFIG.BROWSER_OPTIONS,
                },
                maxConcurrency: CONFIG.CRAWLER_OPTIONS.maxConcurrency,
                navigationTimeoutSecs: CONFIG.CRAWLER_OPTIONS.navigationTimeoutSecs,
                maxRequestRetries: CONFIG.CRAWLER_OPTIONS.maxRequestRetries,

                // Handle each URL
                async requestHandler({ page, request }) {
                    const url = request.url;
                    log.info(`Analyzing: ${url}`);

                    try {
                        const userAgent = getRandomUserAgent(CONFIG.USER_AGENTS);
                        await preparePage(page, userAgent);

                        const navigationSuccessful = await navigateToUrl(page, url);
                        if (!navigationSuccessful) {
                            throw new Error("Failed to load page after multiple attempts");
                        }

                        log.info(`🔍 Starting enhanced detection for ${url}`);

                        // Platform (Shopify/WordPress) and email-tool detection.
                        const siteInfo = await detectTechnologies(page, url, CONFIG);
                        const emailInfo = await detectEmailTools(page, CONFIG.EMAIL_TOOLS);

                        if (siteInfo.detectionMethod) {
                            log.info(`✅ Detection method: ${siteInfo.detectionMethod}`);
                        }

                        // Categorize the site
                        if (siteInfo.isShopify) {
                            results.shopify.push(url);
                            recordEmailTools(results, emailInfo, 'shopifyWith', url);
                            log.info(`${url}: Shopify detected (${siteInfo.detectionMethod || 'standard'})`);
                        } else if (siteInfo.isWordPress) {
                            results.wordpress.push(url);
                            recordEmailTools(results, emailInfo, 'wordpressWith', url);
                            log.info(`${url}: WordPress detected (${siteInfo.detectionMethod || 'standard'})`);
                        } else {
                            results.autres.push(url);
                            log.info(`${url}: Other platform or undetected`);
                        }

                    } catch (error) {
                        log.error(`❌ Error analyzing ${url}: ${error.message}`);

                        // Fallback: classify by URL substrings when the page
                        // itself could not be analyzed.
                        const urlLower = url.toLowerCase();
                        if (urlLower.includes('shopify') ||
                            urlLower.includes('myshopify') ||
                            urlLower.includes('shopifycdn') ||
                            urlLower.includes('shopifycs')) {
                            results.shopify.push(url);
                            log.info(`${url}: Shopify (detected via URL fallback)`);
                        } else if (urlLower.includes('wordpress') ||
                            urlLower.includes('wp-') ||
                            urlLower.includes('wp-content') ||
                            urlLower.includes('wp-includes')) {
                            results.wordpress.push(url);
                            log.info(`${url}: WordPress (detected via URL fallback)`);
                        } else {
                            results.autres.push(url);
                        }
                    }
                },

                // Handle failures
                failedRequestHandler({ request, error }) {
                    log.error(`Request ${request.url} failed: ${error.message}`);
                }
            });

            // Enqueue all URLs at once instead of one addRequests call per URL.
            await crawler.addRequests(urlsToAnalyze.map((url) => ({ url })));

            // Run the crawler for this keyword
            await crawler.run();

            // Format results and store them
            allResults[keyword] = formatKeywordResults(results);

            log.info(`Analysis complete for "${keyword}". Sites analyzed: ${urlsToAnalyze.length}`);
            log.info(`Results: Shopify: ${results.shopify.length}, WordPress: ${results.wordpress.length}, Other: ${results.autres.length}`);
        }

        // Save final results to dataset
        await Actor.pushData(allResults);

        log.info(`\n========== ANALYSIS COMPLETE ==========`);
        log.info(`Keywords analyzed: ${keywords.length}`);

        // Log summary statistics
        let totalShopify = 0, totalWordPress = 0, totalOther = 0;
        for (const keyword in allResults) {
            if (allResults[keyword].shopify) {
                totalShopify += allResults[keyword].shopify.length;
                totalWordPress += allResults[keyword].wordpress.length;
                totalOther += allResults[keyword].autres.length;
            }
        }

        log.info(`FINAL STATS - Shopify: ${totalShopify}, WordPress: ${totalWordPress}, Other: ${totalOther}`);

    } catch (error) {
        log.error(`Main error: ${error.message}`);
        // Persist the failure so the run's dataset records what went wrong.
        await Actor.pushData({
            error: error.message,
            stackTrace: error.stack,
            timestamp: new Date().toISOString()
        });
    }
});

src/routes.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

/** Router shared by the Puppeteer crawler (template boilerplate). */
export const router = createPuppeteerRouter();

/** Default route: enqueue apify.com links and label them for the detail route. */
const handleDefault = async (ctx) => {
    ctx.log.info(`enqueueing new URLs`);
    await ctx.enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
};

/** Detail route: record the loaded URL and its page title in the dataset. */
const handleDetail = async (ctx) => {
    const title = await ctx.page.title();
    ctx.log.info(`${title}`, { url: ctx.request.loadedUrl });

    await Dataset.pushData({
        url: ctx.request.loadedUrl,
        title,
    });
};

router.addDefaultHandler(handleDefault);
router.addHandler('detail', handleDetail);

src/siteAnalyzer.js

1/**
2 * Enhanced site analyzer with improved Shopify and WordPress detection
3 */
4import { log } from 'crawlee';
5
/**
 * Prepare a Puppeteer page for analysis: sets the user agent and viewport,
 * blocks heavy resources (images, fonts, media) via request interception,
 * and sends French-first Accept-Language headers.
 *
 * Any setup failure is logged and swallowed so the caller can still attempt
 * navigation on a partially prepared page.
 *
 * @param {object} page - Puppeteer Page instance.
 * @param {string} userAgent - User-agent string to present.
 */
export async function preparePage(page, userAgent) {
    try {
        await page.setUserAgent(userAgent);
        await page.setViewport({ width: 1366, height: 768 });

        // Block images and fonts to speed up loading
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            const resourceType = req.resourceType();
            try {
                if (resourceType === 'image' || resourceType === 'font' || resourceType === 'media') {
                    req.abort();
                } else {
                    req.continue();
                }
            } catch (interceptError) {
                // abort()/continue() throws "Request is already handled" when
                // another listener resolved the request first; ignoring here
                // prevents an unhandled rejection from killing the crawl.
            }
        });

        // Set extra headers
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        });

    } catch (error) {
        log.warning(`Failed to prepare page: ${error.message}`);
    }
}
35
/**
 * Navigate `page` to `url`, retrying up to `maxRetries` times.
 *
 * Success means the navigation returned a response with HTTP status < 400.
 * HTTP error statuses retry immediately; thrown navigation errors back off
 * linearly (2s, 4s, …) before the next attempt.
 *
 * @param {object} page - Puppeteer Page instance.
 * @param {string} url - Target URL.
 * @param {number} [maxRetries=3] - Maximum navigation attempts.
 * @returns {Promise<boolean>} True when the page loaded successfully.
 */
export async function navigateToUrl(page, url, maxRetries = 3) {
    let attempt = 0;

    while (attempt < maxRetries) {
        attempt += 1;
        log.info(`Navigation attempt ${attempt}/${maxRetries} for ${url}`);

        try {
            const response = await page.goto(url, {
                waitUntil: 'networkidle2',
                timeout: 30000
            });

            if (response && response.status() < 400) {
                return true;
            }

            // Server answered, but with an error status: retry immediately.
            log.warning(`HTTP ${response?.status()} for ${url}, attempt ${attempt}`);

        } catch (error) {
            log.warning(`Navigation attempt ${attempt} failed for ${url}: ${error.message}`);

            if (attempt === maxRetries) {
                return false;
            }

            // Linear back-off before the next try.
            await new Promise((resolve) => setTimeout(resolve, 2000 * attempt));
        }
    }

    return false;
}
69
/**
 * Detect whether the page at `url` runs Shopify or WordPress.
 *
 * Runs a cascade of detection methods, cheapest first, returning as soon as
 * one fires:
 *   1. URL substring check
 *   2. HTTP response headers (re-navigates to read them)
 *   3. Short settle delay for client-side scripts
 *   4. DOM / inline-HTML indicator scan (in-page evaluate)
 *   5. Observed network requests (checkNetworkRequests)
 *   6. Platform API endpoint probing (testApiEndpoints)
 *
 * Fixes vs. previous version:
 *   - `page.waitForLoadState?.('networkidle')` is Playwright API, not
 *     Puppeteer, and `page.waitForTimeout()` has been removed from current
 *     Puppeteer — replaced with a plain timer.
 *   - The "server: nginx/1.14.2 ⇒ Shopify" header heuristic and the generic
 *     'theme-color' meta indicator were removed: both match countless
 *     non-Shopify sites and produced false positives.
 *
 * @param {object} page - Puppeteer Page (already prepared).
 * @param {string} url - URL to analyze.
 * @param {object} config - CONFIG object (kept for interface compatibility;
 *     the in-page indicator lists below are inlined because page.evaluate
 *     cannot close over Node-side variables).
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean, detectionMethod: ?string}>}
 */
export async function detectTechnologies(page, url, config) {
    const result = {
        isShopify: false,
        isWordPress: false,
        detectionMethod: null
    };

    try {
        // Method 1: Quick URL-based detection
        const urlLower = url.toLowerCase();
        if (urlLower.includes('myshopify.com') ||
            urlLower.includes('shopifycdn.com') ||
            urlLower.includes('shopifycs.com')) {
            result.isShopify = true;
            result.detectionMethod = 'URL';
            return result;
        }

        if (urlLower.includes('wordpress.com') ||
            urlLower.includes('wp-content') ||
            urlLower.includes('wp-includes')) {
            result.isWordPress = true;
            result.detectionMethod = 'URL';
            return result;
        }

        // Method 2: HTTP header detection. Navigate (again) so the response
        // headers are available here.
        const response = await page.goto(url, { waitUntil: 'networkidle2' });
        const headers = response?.headers() || {};

        // Shopify-specific headers only; generic server banners are not a
        // reliable signal.
        if (headers['x-shopify-stage'] ||
            headers['x-shopify-shop-api-call-limit'] ||
            headers['x-shopify-request-id']) {
            result.isShopify = true;
            result.detectionMethod = 'HTTP Headers';
            return result;
        }

        // WordPress headers: XML-RPC pingback or a wp-json Link header.
        if (headers['x-pingback'] ||
            (headers['link'] && headers['link'].includes('wp-json'))) {
            result.isWordPress = true;
            result.detectionMethod = 'HTTP Headers';
            return result;
        }

        // Method 3: give client-side scripts a moment to run before the DOM scan.
        await new Promise((resolve) => setTimeout(resolve, 3000));

        // Method 4: Enhanced DOM-based detection, executed in page context.
        const detectionResult = await page.evaluate(() => {
            // Substrings whose presence in the rendered HTML suggests Shopify.
            const shopifyIndicators = [
                // JavaScript objects
                'Shopify',
                'Shopify.theme',
                'Shopify.shop',
                'Shopify.currency',
                'ShopifyAnalytics',
                'window.Shopify',

                // Meta tags
                'shopify-digital-wallet',
                'shopify-checkout-api-token',

                // CSS classes and IDs
                'shopify-section',
                'shopify-policy-list',
                'shopify-product-form',

                // Script sources
                'shopifycdn.com',
                'assets/theme.js',
                'assets/option_selection.js',
                '/wpm@',
                'monorail-edge.shopifysvc.com'
            ];

            const wordpressIndicators = [
                'wp-content',
                'wp-includes',
                'wp-admin',
                'wordpress',
                '/wp-json/',
                'wp-embed',
                'wlwmanifest',
                'xmlrpc.php'
            ];

            let shopifyDetected = false;
            let wordpressDetected = false;
            let method = 'DOM';

            // Strongest in-page signal: the global Shopify JS objects.
            if (typeof window.Shopify !== 'undefined' ||
                typeof window.ShopifyAnalytics !== 'undefined') {
                shopifyDetected = true;
                method = 'JavaScript objects';
            }

            // Substring scan over the whole rendered document.
            const htmlContent = document.documentElement.outerHTML.toLowerCase();

            for (const indicator of shopifyIndicators) {
                if (htmlContent.includes(indicator.toLowerCase())) {
                    shopifyDetected = true;
                    if (method === 'DOM') method = `DOM (${indicator})`;
                    break;
                }
            }

            // Only look for WordPress when Shopify did not match.
            if (!shopifyDetected) {
                for (const indicator of wordpressIndicators) {
                    if (htmlContent.includes(indicator.toLowerCase())) {
                        wordpressDetected = true;
                        if (method === 'DOM') method = `DOM (${indicator})`;
                        break;
                    }
                }
            }

            // <meta name="generator"> frequently names the platform outright.
            const generators = document.querySelectorAll('meta[name="generator"]');
            generators.forEach(gen => {
                const content = gen.getAttribute('content')?.toLowerCase() || '';
                if (content.includes('shopify')) {
                    shopifyDetected = true;
                    method = 'Meta generator';
                } else if (content.includes('wordpress')) {
                    wordpressDetected = true;
                    method = 'Meta generator';
                }
            });

            // Stylesheet URLs also betray the platform.
            const links = document.querySelectorAll('link[rel="stylesheet"]');
            links.forEach(link => {
                const href = link.getAttribute('href')?.toLowerCase() || '';
                if (href.includes('shopifycdn.com') || href.includes('assets/theme.css')) {
                    shopifyDetected = true;
                    method = 'CSS links';
                } else if (href.includes('wp-content') || href.includes('wp-includes')) {
                    wordpressDetected = true;
                    method = 'CSS links';
                }
            });

            return {
                shopify: shopifyDetected,
                wordpress: wordpressDetected,
                method: method
            };
        });

        if (detectionResult.shopify) {
            result.isShopify = true;
            result.detectionMethod = detectionResult.method;
            return result;
        }

        if (detectionResult.wordpress) {
            result.isWordPress = true;
            result.detectionMethod = detectionResult.method;
            return result;
        }

        // Method 5: Network requests analysis
        log.info(`Checking network requests for ${url}`);
        const networkDetection = await checkNetworkRequests(page, url);
        if (networkDetection.isShopify) {
            result.isShopify = true;
            result.detectionMethod = 'Network requests';
            return result;
        }

        if (networkDetection.isWordPress) {
            result.isWordPress = true;
            result.detectionMethod = 'Network requests';
            return result;
        }

        // Method 6: API endpoint testing
        const apiDetection = await testApiEndpoints(page, url);
        if (apiDetection.isShopify) {
            result.isShopify = true;
            result.detectionMethod = 'API endpoints';
            return result;
        }

        if (apiDetection.isWordPress) {
            result.isWordPress = true;
            result.detectionMethod = 'API endpoints';
            return result;
        }

    } catch (error) {
        log.error(`Detection error for ${url}: ${error.message}`);
    }

    return result;
}
279
/**
 * Watch outgoing network requests for platform-specific URLs.
 *
 * Registers a temporary `request` listener on the Puppeteer page and
 * resolves as soon as a Shopify- or WordPress-specific request URL is
 * observed, or after a 5 second timeout. Also scrolls the page and
 * fires a couple of fetch() probes to provoke platform requests.
 *
 * @param {import('puppeteer').Page} page - Page already navigated to the target site.
 * @param {string} url - Target URL (kept for interface compatibility; not used here).
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function checkNetworkRequests(page, url) {
    return new Promise((resolve) => {
        let settled = false;

        // Resolve exactly once and ALWAYS detach the listener. The previous
        // version only called page.off() on the detection path, leaking one
        // request handler per call whenever the timeout fired instead.
        const finish = (result) => {
            if (settled) return;
            settled = true;
            clearTimeout(timeout);
            page.off('request', requestHandler);
            resolve(result);
        };

        const timeout = setTimeout(() => {
            finish({ isShopify: false, isWordPress: false });
        }, 5000);

        const requestHandler = (request) => {
            const requestUrl = request.url().toLowerCase();

            if (requestUrl.includes('shopifycdn.com') ||
                requestUrl.includes('shopifycs.com') ||
                requestUrl.includes('monorail-edge.shopifysvc.com') ||
                requestUrl.includes('/cart/add') ||
                requestUrl.includes('/products.json')) {
                finish({ isShopify: true, isWordPress: false });
            } else if (requestUrl.includes('wp-content') ||
                       requestUrl.includes('wp-includes') ||
                       requestUrl.includes('wp-admin') ||
                       requestUrl.includes('wp-json')) {
                finish({ isShopify: false, isWordPress: true });
            }
        };

        page.on('request', requestHandler);

        // Trigger some page interactions to generate requests.
        page.evaluate(() => {
            // Scroll to trigger lazy loading.
            window.scrollTo(0, document.body.scrollHeight / 2);

            // Try to trigger cart-related requests (Shopify).
            if (window.fetch) {
                fetch('/cart.js').catch(() => {});
                fetch('/products.json?limit=1').catch(() => {});
            }
        }).catch(() => {});
    });
}
330
/**
 * Probe well-known Shopify and WordPress API endpoints.
 *
 * Navigates the page to each candidate endpoint and inspects the
 * response body for platform markers. NOTE: this navigates the page
 * away from its current URL; callers must not rely on the page still
 * showing the original site afterwards.
 *
 * @param {import('puppeteer').Page} page - Puppeteer page used for navigation.
 * @param {string} baseUrl - Site root the endpoints are resolved against.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function testApiEndpoints(page, baseUrl) {
    const result = { isShopify: false, isWordPress: false };

    // Shopify endpoints to test.
    const shopifyEndpoints = [
        '/products.json',
        '/cart.js',
        '/collections.json',
        '/api/2023-01/products.json'
    ];

    // WordPress endpoints to test.
    const wordpressEndpoints = [
        '/wp-json/',
        '/wp-json/wp/v2/',
        '/xmlrpc.php',
        '/?rest_route=/'
    ];

    try {
        // Test Shopify endpoints first.
        for (const endpoint of shopifyEndpoints) {
            try {
                const testUrl = new URL(endpoint, baseUrl).href;
                const response = await page.goto(testUrl, {
                    waitUntil: 'networkidle2',
                    timeout: 10000
                });

                if (response && response.status() === 200) {
                    const contentType = (response.headers()['content-type'] || '').toLowerCase();
                    if (contentType.includes('application/json')) {
                        // Lower-case the body so markers match regardless of casing.
                        const text = (await response.text()).toLowerCase();
                        if (text.includes('products') || text.includes('handle') || text.includes('variants')) {
                            result.isShopify = true;
                            return result;
                        }
                    }
                }
            } catch (e) {
                // Endpoint not available, continue with the next one.
            }
        }

        // Test WordPress endpoints only if Shopify was not detected.
        if (!result.isShopify) {
            for (const endpoint of wordpressEndpoints) {
                try {
                    const testUrl = new URL(endpoint, baseUrl).href;
                    const response = await page.goto(testUrl, {
                        waitUntil: 'networkidle2',
                        timeout: 10000
                    });

                    if (response && response.status() === 200) {
                        // Case-insensitive matching: the previous version compared
                        // the raw body, so e.g. "WordPress" or "methodName" with
                        // different casing slipped through undetected.
                        const text = (await response.text()).toLowerCase();
                        if (text.includes('wp:') ||
                            text.includes('wordpress') ||
                            text.includes('wp-json') ||
                            text.includes('methodname')) {
                            result.isWordPress = true;
                            return result;
                        }
                    }
                } catch (e) {
                    // Endpoint not available, continue with the next one.
                }
            }
        }
    } catch (error) {
        log.warning(`API endpoint testing failed: ${error.message}`);
    }

    return result;
}
409
/**
 * Detect which email marketing tools are present on the current page.
 *
 * Scans the rendered HTML (lower-cased) for each tool's indicator
 * substrings. Every tool defaults to false; a detection failure leaves
 * the defaults in place.
 *
 * @param {import('puppeteer').Page} page - Page to inspect.
 * @param {Object<string, string[]>} emailTools - Tool name -> indicator substrings.
 * @returns {Promise<Object<string, boolean>>} Tool name -> detected flag.
 */
export async function detectEmailTools(page, emailTools) {
    // Start with every tool marked as not detected.
    const results = Object.fromEntries(
        Object.keys(emailTools).map((tool) => [tool, false])
    );

    try {
        const detection = await page.evaluate((toolsConfig) => {
            const html = document.documentElement.outerHTML.toLowerCase();
            const found = {};

            for (const [toolName, indicators] of Object.entries(toolsConfig)) {
                found[toolName] = indicators.some(
                    (indicator) => html.includes(indicator.toLowerCase())
                );
            }

            return found;
        }, emailTools);

        Object.assign(results, detection);
    } catch (error) {
        log.warning(`Email tools detection failed: ${error.message}`);
    }

    return results;
}

src/utils.js

1/**
2 * Utility functions for the crawler
3 */
4
/**
 * Pick one user agent string at random from the supplied list.
 * @param {string[]} userAgents - Candidate user agent strings (must be non-empty).
 * @returns {string} A randomly chosen entry.
 */
export function getRandomUserAgent(userAgents) {
    const index = Math.floor(Math.random() * userAgents.length);
    return userAgents[index];
}
11
12/**
13 * Initialize results structure for a keyword
14 */
15export function initializeResults() {
16 return {
17 shopify: [],
18 wordpress: [],
19 autres: [],
20 // Shopify with email tools
21 shopifyWithKlaviyo: [],
22 shopifyWithMailchimp: [],
23 shopifyWithBrevo: [],
24 shopifyWithHubspot: [],
25 shopifyWithSalesforce: [],
26 shopifyWithConvertkit: [],
27 shopifyWithAweber: [],
28 shopifyWithGetresponse: [],
29 shopifyWithActivecampaign: [],
30 shopifyWithConstantcontact: [],
31 shopifyWithOmnisend: [],
32 shopifyWithPrivy: [],
33 shopifyWithYotpo: [],
34 shopifyWithAttentive: [],
35 // WordPress with email tools
36 wordpressWithKlaviyo: [],
37 wordpressWithMailchimp: [],
38 wordpressWithBrevo: [],
39 wordpressWithHubspot: [],
40 wordpressWithSalesforce: [],
41 wordpressWithConvertkit: [],
42 wordpressWithAweber: [],
43 wordpressWithGetresponse: [],
44 wordpressWithActivecampaign: [],
45 wordpressWithConstantcontact: [],
46 wordpressWithOmnisend: [],
47 wordpressWithPrivy: [],
48 wordpressWithYotpo: [],
49 wordpressWithAttentive: []
50 };
51}
52
/**
 * Format the per-keyword detection results for dataset output.
 *
 * Produces a timestamp, a summary (counts per CMS), the raw buckets,
 * and one `shopify_avec_<tool>` / `wordpress_avec_<tool>` entry for
 * every CMS/email-tool bucket that is non-empty.
 *
 * @param {Object<string, Array>} results - Structure from initializeResults().
 * @returns {Object} Serializable output object.
 */
export function formatKeywordResults(results) {
    const formatted = {
        timestamp: new Date().toISOString(),
        summary: {
            totalSites: results.shopify.length + results.wordpress.length + results.autres.length,
            shopifyCount: results.shopify.length,
            wordpressCount: results.wordpress.length,
            otherCount: results.autres.length
        },
        shopify: results.shopify,
        wordpress: results.wordpress,
        autres: results.autres
    };

    // Email tool combinations, mirrored from initializeResults().
    const emailTools = [
        'Klaviyo', 'Mailchimp', 'Brevo', 'Hubspot', 'Salesforce',
        'Convertkit', 'Aweber', 'Getresponse', 'Activecampaign',
        'Constantcontact', 'Omnisend', 'Privy', 'Yotpo', 'Attentive'
    ];

    for (const tool of emailTools) {
        // Guard with ?? [] so a results object missing a bucket (e.g.
        // produced by an older run) cannot crash formatting — the previous
        // version threw on `undefined.length`.
        const shopifySites = results[`shopifyWith${tool}`] ?? [];
        const wordpressSites = results[`wordpressWith${tool}`] ?? [];

        if (shopifySites.length > 0) {
            formatted[`shopify_avec_${tool.toLowerCase()}`] = shopifySites;
        }

        if (wordpressSites.length > 0) {
            formatted[`wordpress_avec_${tool.toLowerCase()}`] = wordpressSites;
        }
    }

    return formatted;
}
92
/**
 * Pause execution for the given number of milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves after the delay.
 */
export function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
99
/**
 * Normalize a URL for deduplication: lower-case the host, drop a
 * leading "www." and strip a trailing slash from a non-root path.
 * Query string and fragment are discarded. Returns the input unchanged
 * when it cannot be parsed as a URL.
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized URL, or the original string on parse failure.
 */
export function normalizeUrl(url) {
    try {
        const parsed = new URL(url);

        // Drop a leading "www." from the (already lower-cased) host.
        const hostname = parsed.hostname.toLowerCase().replace(/^www\./, '');

        // Strip the trailing slash unless the path is just "/".
        const pathname = parsed.pathname.length > 1
            ? parsed.pathname.replace(/\/$/, '')
            : parsed.pathname;

        return `${parsed.protocol}//${hostname}${pathname}`;
    } catch (error) {
        return url;
    }
}
124
/**
 * Return a copy of the array with duplicate values removed,
 * preserving first-occurrence order.
 * @param {Array} array - Input values.
 * @returns {Array} Deduplicated copy.
 */
export function removeDuplicates(array) {
    return Array.from(new Set(array));
}
131
/**
 * Extract the host from a URL: lower-cased hostname with a leading
 * "www." removed.
 * @param {string} url - URL to parse.
 * @returns {string|null} Hostname without the "www." prefix, or null when the URL is invalid.
 */
export function extractDomain(url) {
    try {
        const hostname = new URL(url).hostname.toLowerCase();
        // Strip only a LEADING "www." — the previous .replace('www.', '')
        // removed the first occurrence anywhere in the host, mangling
        // hosts like "awww.example.com" into "a.example.com".
        return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
    } catch (error) {
        return null;
    }
}
143
/**
 * Check whether a string parses as an absolute URL.
 * @param {string} string - Candidate URL.
 * @returns {boolean} True when `new URL()` accepts the string.
 */
export function isValidUrl(string) {
    try {
        // The URL constructor throws a TypeError on invalid input.
        new URL(string);
    } catch (_) {
        return false;
    }
    return true;
}
155
/**
 * Pick a random integer delay between min and max milliseconds, inclusive.
 * @param {number} [min=1000] - Lower bound in ms.
 * @param {number} [max=3000] - Upper bound in ms.
 * @returns {number} Random delay in the [min, max] range.
 */
export function randomDelay(min = 1000, max = 3000) {
    const span = max - min + 1;
    return min + Math.floor(Math.random() * span);
}
162
/**
 * Run an async function, retrying on failure with exponential backoff.
 *
 * Waits baseDelay, 2*baseDelay, 4*baseDelay, ... between attempts.
 * The function is always invoked at least once, even when maxRetries
 * is 0 or negative — the previous version silently resolved undefined
 * without ever calling fn in that case.
 *
 * @param {Function} fn - Async (or sync) function to execute.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @param {number} [baseDelay=1000] - Initial backoff delay in ms.
 * @returns {Promise<*>} Result of the first successful call.
 * @throws The last error when every attempt fails.
 */
export async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
    // Guarantee at least one attempt.
    const attempts = Math.max(1, maxRetries);

    for (let attempt = 1; attempt <= attempts; attempt++) {
        try {
            return await fn();
        } catch (error) {
            // Out of attempts: surface the last failure to the caller.
            if (attempt === attempts) {
                throw error;
            }

            // Exponential backoff: baseDelay * 2^(attempt-1).
            const delay = baseDelay * Math.pow(2, attempt - 1);
            await sleep(delay);
        }
    }
}
180
/**
 * Normalize a search keyword: trim surrounding whitespace and lower-case it.
 * @param {*} keyword - Raw keyword value.
 * @returns {string|null} Cleaned keyword, or null for non-string or empty input.
 */
export function cleanKeyword(keyword) {
    const isUsable = typeof keyword === 'string' && keyword.length > 0;
    if (!isUsable) {
        return null;
    }

    return keyword.trim().toLowerCase();
}
191
/**
 * Turn a keyword into a filesystem-safe, lower-case identifier:
 * non-alphanumeric runs become single underscores, with no leading
 * or trailing underscore.
 * @param {string} keyword - Keyword to convert.
 * @returns {string} Safe filename fragment.
 */
export function generateSafeFilename(keyword) {
    const underscored = keyword.replace(/[^a-zA-Z0-9]/g, '_');
    const collapsed = underscored.replace(/_+/g, '_');
    const trimmed = collapsed.replace(/^_|_$/g, '');
    return trimmed.toLowerCase();
}
202
/**
 * Compute a weighted confidence percentage from detection results.
 * Every method contributes its weight (default 1) to the maximum
 * possible score; only methods that detected something contribute to
 * the actual score.
 * @param {Object<string, boolean>} detectionMethods - Method name -> detected flag.
 * @param {Object<string, number>} weights - Method name -> weight (missing or falsy => 1).
 * @returns {number} Score in [0, 100]; 0 when there are no methods.
 */
export function calculateConfidenceScore(detectionMethods, weights) {
    const entries = Object.entries(detectionMethods);

    const maxPossibleScore = entries.reduce(
        (sum, [method]) => sum + (weights[method] || 1),
        0
    );
    const totalScore = entries.reduce(
        (sum, [method, detected]) => sum + (detected ? (weights[method] || 1) : 0),
        0
    );

    return maxPossibleScore > 0 ? (totalScore / maxPossibleScore) * 100 : 0;
}
221
/**
 * Compute and log per-keyword detection statistics.
 * Percentages are rounded to whole numbers and stay 0 when nothing
 * was analyzed.
 * @param {Object<string, Array>} results - Buckets from initializeResults().
 * @param {string} keyword - Keyword the results belong to.
 * @returns {Object} Stats object (counts plus rounded percentages).
 */
export function logStatistics(results, keyword) {
    const shopifyCount = results.shopify.length;
    const wordpressCount = results.wordpress.length;
    const otherCount = results.autres.length;
    const totalAnalyzed = shopifyCount + wordpressCount + otherCount;

    const stats = {
        keyword,
        timestamp: new Date().toISOString(),
        totalAnalyzed,
        shopify: shopifyCount,
        wordpress: wordpressCount,
        other: otherCount,
        shopifyPercentage: totalAnalyzed > 0 ? Math.round((shopifyCount / totalAnalyzed) * 100) : 0,
        wordpressPercentage: totalAnalyzed > 0 ? Math.round((wordpressCount / totalAnalyzed) * 100) : 0
    };

    console.log(`📊 Statistics for "${keyword}":`, stats);
    return stats;
}

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
"name": "shopify-wordpress-crawler",
"version": "2.0.0",
"description": "Enhanced Apify Actor for analyzing Shopify and WordPress usage with improved detection",
"main": "main.js",
"type": "module",
"scripts": {
"start": "node main.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [
"apify",
"shopify",
"wordpress",
"crawler",
"web-scraping",
"ecommerce",
"cms-detection"
],
"author": "Your Name",
"license": "MIT",
"dependencies": {
"apify": "^3.1.0",
"crawlee": "^3.5.0",
"puppeteer": "^21.0.0"
},
"repository": {
"type": "git",
"url": "https://github.com/your-username/shopify-wordpress-crawler"
}
}