Recherche CMS
Under maintenance
Pricing
Pay per usage
Go to Store
Recherche CMS
Under maintenance
0.0 (0)
Pricing
Pay per usage
1
Total users
4
Monthly users
2
Runs succeeded
6.7%
Last modified
a day ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-1", "title": "Project Puppeteer Crawler JavaScript", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Analyse CMS et Marketing Email - Version Améliorée", "type": "object", "schemaVersion": 1, "description": "Analysez les sites web pour détecter leur CMS (Shopify/WordPress) et leurs outils d'email marketing avec une détection Shopify améliorée", "properties": { "keywords": { "title": "Termes de recherche", "type": "array", "description": "Entrez un ou plusieurs mots-clés à rechercher sur Google (un par ligne)", "editor": "stringList", "default": ["vêtements", "mode"], "sectionCaption": "Paramètres de recherche", "sectionDescription": "Configurez votre recherche Google" }, "maxPages": { "title": "Nombre de pages Google par mot-clé", "type": "integer", "description": "Nombre de pages de résultats Google à analyser pour chaque mot-clé", "default": 3, "minimum": 1, "maximum": 10, "unit": "page(s)" }, "maxUrlsToAnalyze": { "title": "Nombre de sites à analyser par mot-clé", "type": "integer", "description": "Combien de sites souhaitez-vous scanner pour chaque mot-clé", "default": 30, "minimum": 5, "maximum": 100, "unit": "site(s)" }, "country": { "title": "Pays pour les résultats", "type": "string", "description": "Choisissez le pays pour lequel vous souhaitez voir les résultats Google", "editor": "select", "default": "fr", "enum": ["fr", "be", "ch", "ca", "ma", "us", "gb"], "enumTitles": ["France", "Belgique", "Suisse", "Canada", "Maroc", "États-Unis", "Royaume-Uni"] } }, "required": ["keywords"]}
src/config.js
/**
 * Static configuration for the crawler: search defaults, browser launch
 * options, and the indicator lists used to recognise Shopify, WordPress and
 * email-marketing tools.
 *
 * All indicator strings are written in lower case (or are matched after
 * lower-casing by the consumers visible in this project).
 */
export const CONFIG = {
    // Search parameters
    DEFAULT_COUNTRY: 'fr',
    MAX_PAGES_PER_KEYWORD: 3,
    MAX_URLS_TO_ANALYZE: 30,

    // Google domain to query for each supported country code
    GOOGLE_DOMAINS: {
        fr: 'google.fr',
        be: 'google.be',
        ch: 'google.ch',
        ca: 'google.ca',
        ma: 'google.co.ma',
        us: 'google.com',
        gb: 'google.co.uk',
    },

    // Browser configuration (passed to the crawler as launch options)
    BROWSER_OPTIONS: {
        headless: true,
        args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
            '--window-size=1366,768',
            '--hide-scrollbars',
            '--disable-notifications',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-renderer-backgrounding',
            '--disable-features=TranslateUI',
            '--disable-ipc-flooding-protection',
        ],
    },

    // Crawler settings
    CRAWLER_OPTIONS: {
        maxConcurrency: 3,
        navigationTimeoutSecs: 45,
        maxRequestRetries: 2,
    },

    // User agents for rotation
    USER_AGENTS: [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    ],

    // Enhanced Shopify detection indicators
    SHOPIFY_INDICATORS: {
        // URL patterns
        urls: [
            'myshopify.com',
            'shopifycdn.com',
            'shopifycs.com',
            'shopify-analytics.com',
            'monorail-edge.shopifysvc.com',
        ],
        // HTTP headers
        headers: [
            'x-shopify-stage',
            'x-shopify-shop-api-call-limit',
            'x-shopify-request-id',
            'x-shopify-shop-domain',
            'x-shopify-api-version',
        ],
        // JavaScript objects
        javascript: [
            'Shopify',
            'Shopify.theme',
            'Shopify.shop',
            'Shopify.currency',
            'ShopifyAnalytics',
            'Shopify.checkout',
            'Shopify.routes',
        ],
        // DOM elements and classes
        dom: [
            'shopify-section',
            'shopify-policy-list',
            'shopify-product-form',
            'shopify-payment-button',
            'shopify-cleanslate',
            'shopify-product-reviews-badge',
        ],
        // Meta tags
        meta: [
            'shopify-digital-wallet',
            'shopify-checkout-api-token',
            'shopify-checkout-domain',
        ],
        // Script and CSS sources
        assets: [
            'assets/theme.js',
            'assets/option_selection.js',
            'assets/theme.css',
            '/wpm@',
            'cdn.shopify.com',
            'v3.shopifycdn.com',
        ],
        // API endpoints
        endpoints: [
            '/products.json',
            '/cart.js',
            '/collections.json',
            '/cart/add',
            '/cart/update',
            '/api/2023-01/',
            '/api/2024-01/',
        ],
        // Text patterns in HTML
        textPatterns: [
            'powered by shopify',
            'shopify-checkout',
            'shopify-features',
            'shopify-money-format',
            'var shopifycdn',
        ],
    },

    // Enhanced WordPress detection indicators
    WORDPRESS_INDICATORS: {
        // URL patterns
        urls: ['wp-content', 'wp-includes', 'wp-admin', 'wordpress.com', 'wp-json'],
        // HTTP headers
        headers: ['x-pingback', 'x-powered-by-wordpress'],
        // DOM elements and classes
        dom: [
            'wp-block',
            'wp-container',
            'wp-site-blocks',
            'wp-block-group',
            'wp-block-column',
            'wp-block-paragraph',
            'wp-embed',
            'wp-caption',
            'wp-image',
        ],
        // Meta tags
        meta: ['generator', 'wlwmanifest'],
        // Script and CSS sources
        assets: [
            'wp-content/themes',
            'wp-content/plugins',
            'wp-includes/js',
            'wp-includes/css',
            'wp-emoji-release.min.js',
        ],
        // API endpoints
        endpoints: [
            '/wp-json/',
            '/wp-json/wp/v2/',
            '/xmlrpc.php',
            '/?rest_route=/',
            '/wp-admin/admin-ajax.php',
        ],
        // Text patterns in HTML
        textPatterns: ['powered by wordpress', 'wp-json', 'wp_enqueue_script', 'wp_head', 'wordpress'],
    },

    // Email marketing tools detection: tool name -> substrings looked up in the page HTML
    EMAIL_TOOLS: {
        klaviyo: ['klaviyo', 'klaviyo.com', 'static.klaviyo.com', 'kla.js'],
        mailchimp: ['mailchimp', 'mc.us', 'chimpstatic.com', 'mc4wp', 'mailchimp-woocommerce'],
        brevo: ['brevo', 'sendinblue', 'sibforms', 'mailin.fr', 'sib-api'],
        hubspot: ['hubspot', 'hs-scripts.com', 'hsforms.net', 'hubapi.com'],
        salesforce: ['salesforce', 'pardot', 'exacttarget', 'marketingcloud'],
        convertkit: ['convertkit', 'ck.page', 'convertkit-mail'],
        aweber: ['aweber', 'aweber.com', 'forms.aweber.com'],
        getresponse: ['getresponse', 'getresponse.com', 'gr-cdn.com'],
        activecampaign: ['activecampaign', 'activehosted.com', 'trackcmp.net'],
        // The domain is constantcontact.com (the original listed a misspelt "constantcontacts.com")
        constantcontact: ['constantcontact', 'ctctcdn.com', 'constantcontact.com'],
        omnisend: ['omnisend', 'omnisrc.com', 'omnisend.com'],
        privy: ['privy', 'privy.com', 'widget.privy.com'],
        yotpo: ['yotpo', 'staticw2.yotpo.com', 'yotpo.com'],
        attentive: ['attentive', 'attentivemobile.com', 'attn.tv'],
    },

    // Fallback URLs for testing
    FALLBACK_URLS: ['https://www.shopify.com', 'https://wordpress.org', 'https://example.com'],

    // Delays and timeouts (milliseconds)
    DELAYS: {
        betweenRequests: 1000,
        pageLoad: 3000,
        networkIdle: 5000,
        apiTest: 10000,
    },

    // Detection weights (for scoring system)
    DETECTION_WEIGHTS: {
        url: 10,
        headers: 9,
        javascript: 8,
        meta: 7,
        dom: 6,
        assets: 5,
        endpoints: 8,
        textPatterns: 4,
    },
};
src/googleSearch.js
1/**2 * Google search functionality3 */4import { log } from 'crawlee';5import { PuppeteerCrawler } from 'crawlee';6import { getRandomUserAgent, normalizeUrl, removeDuplicates, sleep } from './utils.js';7
/**
 * Search Google for a keyword and collect the external result URLs.
 *
 * One request per result page is queued on a dedicated single-concurrency
 * PuppeteerCrawler. The crawler performs the navigation itself; user agent,
 * viewport and anti-detection overrides are installed in a pre-navigation hook
 * so they are already active for that (single) page load.
 *
 * @param {string} keyword - Search term.
 * @param {string} country - Country code used to pick the Google domain and the `gl` parameter.
 * @param {object} config - Configuration object; reads GOOGLE_DOMAINS, BROWSER_OPTIONS,
 *   USER_AGENTS, MAX_PAGES_PER_KEYWORD and DEFAULT_COUNTRY.
 * @param {number} [maxPages] - Number of result pages to fetch; defaults to
 *   `config.MAX_PAGES_PER_KEYWORD` (or 3), which is what the three-argument form always used.
 * @returns {Promise<string[]>} Normalised, de-duplicated result URLs.
 */
export async function searchGoogleForUrls(keyword, country, config, maxPages = config.MAX_PAGES_PER_KEYWORD || 3) {
    log.info(`🔍 Starting Google search for: "${keyword}" in ${country}`);

    const countryCode = String(country || config.DEFAULT_COUNTRY || 'fr').toLowerCase();
    const googleDomain = config.GOOGLE_DOMAINS[countryCode] || config.GOOGLE_DOMAINS.fr;
    const pageCount = Math.max(1, Math.floor(Number(maxPages)) || 1);

    // page.waitForTimeout() no longer exists in current Puppeteer releases, so pause with a plain timer.
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    const collectedUrls = [];

    const searchCrawler = new PuppeteerCrawler({
        launchContext: {
            launchOptions: config.BROWSER_OPTIONS,
        },
        maxConcurrency: 1, // Keep it low to avoid being blocked
        navigationTimeoutSecs: 30,
        maxRequestRetries: 2,

        // Runs before the crawler navigates, so the overrides apply to the real page load.
        preNavigationHooks: [
            async ({ page }, gotoOptions) => {
                await page.setUserAgent(getRandomUserAgent(config.USER_AGENTS));
                await page.setViewport({ width: 1366, height: 768 });

                await page.evaluateOnNewDocument(() => {
                    // Hide the automation flag
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined,
                    });

                    // Provide the `chrome` object a regular Chrome exposes
                    window.chrome = {
                        runtime: {},
                        loadTimes() {},
                        csi() {},
                        app: {},
                    };

                    // Non-empty plugin list
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5],
                    });

                    // Browser languages
                    Object.defineProperty(navigator, 'languages', {
                        get: () => ['fr-FR', 'fr', 'en'],
                    });
                });

                if (gotoOptions) {
                    gotoOptions.waitUntil = 'networkidle2';
                }
            },
        ],

        async requestHandler({ page, request }) {
            const pageNumber = request.userData.pageNumber || 1;
            log.info(`📄 Searching Google page ${pageNumber} for "${keyword}"`);

            // Google redirects blocked clients to /sorry/...; fail the request so it is retried.
            if (page.url().includes('/sorry/')) {
                throw new Error(`Google served a rate-limit/CAPTCHA page for "${keyword}" (page ${pageNumber})`);
            }

            // Give late-rendered results a moment to appear
            await pause(2000);

            // Handle cookie consent if present (best effort)
            try {
                const acceptButton = await page.$('button[id*="accept"], button[id*="consent"], #L2AGLb');
                if (acceptButton) {
                    await acceptButton.click();
                    await pause(1000);
                }
            } catch (consentError) {
                log.info('No cookie consent found or already handled');
            }

            // Extract external result URLs inside the page
            const urls = await page.evaluate(() => {
                const blockedDomains = [
                    'youtube.com',
                    'facebook.com',
                    'instagram.com',
                    'twitter.com',
                    'linkedin.com',
                    'wikipedia.org',
                    'googleusercontent.com',
                ];
                const found = new Set();

                for (const anchor of document.querySelectorAll('a[href]')) {
                    try {
                        let target = new URL(anchor.href);

                        // Unwrap Google redirect links (/url?q=<real target>) before filtering
                        if (/(^|\.)google\./.test(target.hostname) && target.pathname === '/url') {
                            const wrapped = target.searchParams.get('q') || target.searchParams.get('url');
                            if (!wrapped) continue;
                            target = new URL(wrapped);
                        }

                        if (target.protocol !== 'http:' && target.protocol !== 'https:') continue;

                        // Filter on the host only, so a blocked name inside a path or query does not hide a result
                        const host = target.hostname.toLowerCase();
                        if (/(^|\.)google\./.test(host)) continue;
                        if (blockedDomains.some((domain) => host === domain || host.endsWith(`.${domain}`))) continue;

                        found.add(target.href);
                    } catch (parseError) {
                        // Not an absolute URL; ignore this anchor
                    }
                }

                return [...found];
            });

            log.info(`Found ${urls.length} URLs on page ${pageNumber}`);
            collectedUrls.push(...urls);
        },

        // Crawlee passes the error as the second argument, not as a context property.
        failedRequestHandler({ request }, error) {
            log.error(`Google search request failed: ${error?.message ?? 'unknown error'} (${request.url})`);
        },
    });

    // Generate one search URL per result page
    const searchRequests = [];
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        const start = (pageNumber - 1) * 10;
        searchRequests.push({
            url: `https://${googleDomain}/search?q=${encodeURIComponent(keyword)}&start=${start}&num=10&hl=fr&gl=${countryCode}`,
            userData: { pageNumber },
        });
    }

    await searchCrawler.addRequests(searchRequests);
    await searchCrawler.run();

    // Keep only parseable URLs, normalise them, then drop duplicates
    const parseableUrls = collectedUrls.filter((candidate) => {
        try {
            new URL(candidate);
            return true;
        } catch (parseError) {
            return false;
        }
    });
    const cleanedUrls = removeDuplicates(parseableUrls.map((candidate) => normalizeUrl(candidate)));

    log.info(`✅ Google search completed. Found ${cleanedUrls.length} unique URLs for "${keyword}"`);

    // Log some sample URLs for debugging
    if (cleanedUrls.length > 0) {
        log.info(`Sample URLs found: ${cleanedUrls.slice(0, 3).join(', ')}`);
    }

    return cleanedUrls;
}
/**
 * Placeholder for a browser-less Google search (plain HTTP requests).
 *
 * Not implemented yet: it only logs that the alternative path was requested
 * and resolves to an empty list, which lets callers fall back to their
 * default URLs. `country` and `config` are accepted but currently unused.
 *
 * @param {string} keyword - Search term (used in the log message only).
 * @param {string} country - Unused for now.
 * @param {object} config - Unused for now.
 * @returns {Promise<string[]>} Always an empty array.
 */
export async function searchGoogleAlternative(keyword, country, config) {
    const notice = `🔄 Using alternative Google search for: "${keyword}"`;
    log.info(notice);

    const noResults = [];
    return noResults;
}
/**
 * Extract external result URLs from raw Google search-results HTML.
 *
 * Scans `href="..."` attributes (this also covers `data-href="..."`, since the
 * latter contains the former), decodes `&amp;`, and drops links whose *host*
 * belongs to Google or to one of the excluded sites. Each URL is returned once,
 * in document order.
 *
 * @param {string} html - Raw HTML of a results page.
 * @returns {string[]} Unique absolute http(s) URLs; empty for empty or non-string input.
 */
function extractUrlsFromHtml(html) {
    if (typeof html !== 'string' || html.length === 0) {
        return [];
    }

    const excludedDomains = [
        'youtube.com',
        'facebook.com',
        'instagram.com',
        'twitter.com',
        'linkedin.com',
        'wikipedia.org',
    ];

    // Match on the host so that e.g. "?ref=facebook.com" in a query string does not hide a result.
    const isExcludedHost = (host) =>
        /(^|\.)google\./.test(host) ||
        excludedDomains.some((domain) => host === domain || host.endsWith(`.${domain}`));

    const urls = new Set();

    for (const match of html.matchAll(/href="(https?:\/\/[^"]+)"/g)) {
        const url = match[1].replaceAll('&amp;', '&');

        let host;
        try {
            host = new URL(url).hostname.toLowerCase();
        } catch (parseError) {
            continue; // malformed URL in the markup
        }

        if (!isExcludedHost(host)) {
            urls.add(url);
        }
    }

    return [...urls];
}
/**
 * Build a list of search-query variations for a keyword.
 *
 * The keyword is trimmed first. Blank or non-string input yields an empty list
 * instead of queries such as `"" site:*.com`. Double quotes inside the keyword
 * are removed from the exact-phrase variant so they cannot break its quoting.
 *
 * @param {string} baseKeyword - Keyword to build variations for.
 * @returns {string[]} The keyword itself followed by four variations, or `[]`.
 */
export function generateSearchQueries(baseKeyword) {
    const keyword = typeof baseKeyword === 'string' ? baseKeyword.trim() : '';
    if (keyword === '') {
        return [];
    }

    const phrase = keyword.replaceAll('"', '');

    return [
        keyword,
        `"${phrase}" site:*.com`,
        `${keyword} boutique`,
        `${keyword} shop`,
        `${keyword} store`,
    ];
}
/**
 * Decide whether a URL is a usable search result.
 *
 * A URL is rejected when it cannot be parsed, has no host, or its host belongs
 * to an excluded site. Hosts are compared on domain-label boundaries, so
 * `thebay.com` is not mistaken for eBay and `notfacebook.com` is not mistaken
 * for Facebook.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} `true` when the URL should be kept.
 */
function isValidSearchResult(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    } catch (parseError) {
        return false;
    }

    if (hostname === '') {
        return false;
    }

    // Excluded on this exact registrable domain (and its subdomains)
    const excludedDomains = [
        'youtube.com',
        'facebook.com',
        'instagram.com',
        'twitter.com',
        'linkedin.com',
        'wikipedia.org',
        'aliexpress.com',
        'alibaba.com',
    ];

    // Excluded under any top-level domain (google.fr, amazon.co.uk, ebay.de, ...)
    const excludedBrands = ['google', 'amazon', 'ebay'];

    const isExcludedDomain = excludedDomains.some(
        (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
    );

    // The last label is the TLD itself, so only the labels before it can be a brand name.
    const isExcludedBrand = hostname
        .split('.')
        .slice(0, -1)
        .some((label) => excludedBrands.includes(label));

    return !(isExcludedDomain || isExcludedBrand);
}
src/main.js
1/**2 * Apify Actor for analyzing Shopify and WordPress usage from Google search results3 * Enhanced version with improved Shopify detection4 */5import { Actor } from 'apify';6import { PuppeteerCrawler, log } from 'crawlee';7import { CONFIG } from './src/config.js';8import { 9 getRandomUserAgent, 10 initializeResults, 11 formatKeywordResults 12} from './src/utils.js';13import { searchGoogleForUrls } from './src/googleSearch.js';14import { 15 detectTechnologies, 16 detectEmailTools, 17 preparePage, 18 navigateToUrl 19} from './src/siteAnalyzer.js';20
// Main Actor function: for every input keyword, search Google, analyse each
// result site (Shopify / WordPress / other, plus email tools) and push one
// combined result object to the dataset.
await Actor.main(async () => {
    /** Upper-cases the first character, e.g. `klaviyo` -> `Klaviyo`. */
    const capitalize = (text) => `${text.charAt(0).toUpperCase()}${text.slice(1)}`;

    /**
     * Files `url` under `platform` ('shopify' or 'wordpress') and under one
     * `<platform>With<Tool>` bucket per detected email tool.
     */
    const recordSite = (results, platform, url, emailInfo) => {
        results[platform].push(url);
        for (const [tool, isUsed] of Object.entries(emailInfo ?? {})) {
            if (isUsed) {
                // Create the bucket on demand: a missing bucket would otherwise throw after the
                // site was already counted, and the error fallback would then count it again.
                (results[`${platform}With${capitalize(tool)}`] ??= []).push(url);
            }
        }
    };

    try {
        log.info('Starting the enhanced Shopify/WordPress analyzer');

        // APIFY_IS_AT_HOME is set only when running on the Apify platform,
        // so its absence is what identifies a local/test run.
        const isInTestMode = !process.env.APIFY_IS_AT_HOME;

        // Get and process input parameters
        const input = (await Actor.getInput()) || {};
        const {
            keywords = ['test'],
            country = CONFIG.DEFAULT_COUNTRY,
            maxPages = isInTestMode ? 1 : CONFIG.MAX_PAGES_PER_KEYWORD,
            maxUrlsToAnalyze = isInTestMode ? 3 : CONFIG.MAX_URLS_TO_ANALYZE,
        } = input;

        // Validate input
        if (!Array.isArray(keywords) || keywords.length === 0) {
            throw new Error('At least one keyword is required');
        }

        log.info(`Analyzing ${keywords.length} keywords with up to ${maxUrlsToAnalyze} URLs per keyword`);

        // The proxy does not depend on the keyword, so create it once. Fall back to
        // direct connections when the proxy group is not available to this account.
        let proxyConfiguration;
        if (!isInTestMode) {
            try {
                proxyConfiguration = await Actor.createProxyConfiguration({
                    useApifyProxy: true,
                    apifyProxyGroups: ['RESIDENTIAL'],
                });
            } catch (proxyError) {
                log.warning(`Proxy unavailable, continuing without proxy: ${proxyError.message}`);
            }
        }

        // Results container for all keywords
        const allResults = {};

        for (const rawKeyword of keywords) {
            const keyword = typeof rawKeyword === 'string' ? rawKeyword.trim() : '';

            // Skip empty keywords
            if (keyword === '') {
                log.warning('Empty keyword detected, skipping');
                continue;
            }

            log.info(`\n========== ANALYZING KEYWORD: ${keyword} ==========`);

            // Step 1: Search Google and get URLs
            let urlsToAnalyze = [];
            try {
                // maxPages is passed as an extra argument; a search implementation that only
                // reads the page count from the config object simply ignores it.
                const searchResults = await searchGoogleForUrls(keyword, country, CONFIG, maxPages);

                if (searchResults.length > 0) {
                    // Take only the specified number of URLs
                    urlsToAnalyze = searchResults.slice(0, maxUrlsToAnalyze);
                    log.info(`Found ${urlsToAnalyze.length} URLs to analyze for "${keyword}"`);
                } else {
                    log.warning(`No URLs found for "${keyword}", using fallback URLs`);
                    urlsToAnalyze = [...CONFIG.FALLBACK_URLS];
                }
            } catch (searchError) {
                log.error(`Search failed for "${keyword}": ${searchError.message}`);
                urlsToAnalyze = [...CONFIG.FALLBACK_URLS];
                log.info(`Using ${urlsToAnalyze.length} fallback URLs`);
            }

            // Skip if no URLs were found
            if (urlsToAnalyze.length === 0) {
                log.info(`No URLs to analyze for "${keyword}", skipping`);
                allResults[keyword] = {
                    message: 'No URLs found for this search',
                    timestamp: new Date().toISOString(),
                };
                continue;
            }

            // Initialize results for this keyword
            const results = initializeResults();

            // Step 2: Set up crawler to analyze each URL
            const crawler = new PuppeteerCrawler({
                proxyConfiguration,
                launchContext: {
                    launchOptions: CONFIG.BROWSER_OPTIONS,
                },
                maxConcurrency: CONFIG.CRAWLER_OPTIONS.maxConcurrency,
                navigationTimeoutSecs: CONFIG.CRAWLER_OPTIONS.navigationTimeoutSecs,
                maxRequestRetries: CONFIG.CRAWLER_OPTIONS.maxRequestRetries,

                // Handle each URL
                async requestHandler({ page, request }) {
                    const { url } = request;
                    log.info(`Analyzing: ${url}`);

                    try {
                        // Prepare the page
                        await preparePage(page, getRandomUserAgent(CONFIG.USER_AGENTS));

                        // Navigate to the URL
                        const navigationSuccessful = await navigateToUrl(page, url);
                        if (!navigationSuccessful) {
                            throw new Error('Failed to load page after multiple attempts');
                        }

                        log.info(`🔍 Starting enhanced detection for ${url}`);

                        // Detect technologies (Shopify, WordPress), then email marketing tools
                        const siteInfo = await detectTechnologies(page, url, CONFIG);
                        const emailInfo = await detectEmailTools(page, CONFIG.EMAIL_TOOLS);

                        if (siteInfo.detectionMethod) {
                            log.info(`✅ Detection method: ${siteInfo.detectionMethod}`);
                        }

                        // Categorize the site
                        if (siteInfo.isShopify) {
                            recordSite(results, 'shopify', url, emailInfo);
                            log.info(`✅ ${url}: Shopify detected (${siteInfo.detectionMethod || 'standard'})`);
                        } else if (siteInfo.isWordPress) {
                            recordSite(results, 'wordpress', url, emailInfo);
                            log.info(`✅ ${url}: WordPress detected (${siteInfo.detectionMethod || 'standard'})`);
                        } else {
                            results.autres.push(url);
                            log.info(`✅ ${url}: Other platform or undetected`);
                        }
                    } catch (error) {
                        // Best effort: the site is still categorised from its URL instead of being retried.
                        log.error(`❌ Error analyzing ${url}: ${error.message}`);

                        const urlLower = url.toLowerCase();
                        const wordpressMarkers = ['wordpress', 'wp-content', 'wp-includes', 'wp-admin', 'wp-json'];

                        if (urlLower.includes('shopify')) {
                            results.shopify.push(url);
                            log.info(`✅ ${url}: Shopify (detected via URL fallback)`);
                        } else if (wordpressMarkers.some((marker) => urlLower.includes(marker))) {
                            results.wordpress.push(url);
                            log.info(`✅ ${url}: WordPress (detected via URL fallback)`);
                        } else {
                            results.autres.push(url);
                        }
                    }
                },

                // Handle failures (Crawlee passes the error as the second argument)
                failedRequestHandler({ request }, error) {
                    log.error(`Request ${request.url} failed: ${error?.message ?? 'unknown error'}`);
                },
            });

            // Crawlers created without an explicit queue share the run's default request queue,
            // which de-duplicates by uniqueKey. Scope the key to the keyword so a URL already
            // analysed for an earlier keyword (e.g. the fallback URLs) is processed again here.
            await crawler.addRequests(
                urlsToAnalyze.map((url) => ({ url, uniqueKey: `${keyword}|${url}` })),
            );

            // Run the crawler for this keyword
            await crawler.run();

            // Format results and store them
            allResults[keyword] = formatKeywordResults(results);

            log.info(`Analysis complete for "${keyword}". Sites analyzed: ${urlsToAnalyze.length}`);
            log.info(`Results: Shopify: ${results.shopify.length}, WordPress: ${results.wordpress.length}, Other: ${results.autres.length}`);
        }

        // Save final results to dataset
        await Actor.pushData(allResults);

        log.info(`\n========== ANALYSIS COMPLETE ==========`);
        log.info(`Keywords analyzed: ${keywords.length}`);

        // Log summary statistics (entries without result lists, e.g. "no URLs" notices, count as zero)
        let totalShopify = 0;
        let totalWordPress = 0;
        let totalOther = 0;
        for (const keywordResults of Object.values(allResults)) {
            totalShopify += keywordResults.shopify?.length ?? 0;
            totalWordPress += keywordResults.wordpress?.length ?? 0;
            totalOther += keywordResults.autres?.length ?? 0;
        }

        log.info(`FINAL STATS - Shopify: ${totalShopify}, WordPress: ${totalWordPress}, Other: ${totalOther}`);
    } catch (error) {
        log.error(`Main error: ${error.message}`);
        await Actor.pushData({
            error: error.message,
            stackTrace: error.stack,
            timestamp: new Date().toISOString(),
        });
        // Re-throw so the run is reported as failed instead of silently succeeding.
        throw error;
    }
});
src/routes.js
1import { Dataset, createPuppeteerRouter } from 'crawlee';2
/**
 * Request router with two handlers:
 *  - the default handler enqueues links matching https://apify.com/* under the label `detail`;
 *  - the `detail` handler stores the loaded URL and the page title in the dataset.
 */
export const router = createPuppeteerRouter();

/** Default handler: queue every matching link for the `detail` handler. */
async function enqueueDetailPages({ enqueueLinks, log }) {
    log.info(`enqueueing new URLs`);

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
}

/** `detail` handler: log the page title and save it together with the loaded URL. */
async function storePageTitle({ request, page, log }) {
    const title = await page.title();
    const url = request.loadedUrl;

    log.info(`${title}`, { url });
    await Dataset.pushData({ url, title });
}

router.addDefaultHandler(enqueueDetailPages);
router.addHandler('detail', storePageTitle);
src/siteAnalyzer.js
1/**2 * Enhanced site analyzer with improved Shopify and WordPress detection3 */4import { log } from 'crawlee';5
/**
 * Prepare a page before analysis: user agent, viewport, request blocking and
 * extra HTTP headers.
 *
 * Image, font and media requests are aborted to speed up loading; everything
 * else is allowed through. Requests that another interception handler has
 * already resolved are left alone, because resolving a request twice makes
 * Puppeteer throw. Failures are logged as warnings and never thrown, so a page
 * that cannot be fully prepared is still analysed.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} userAgent - User-agent string to present.
 * @returns {Promise<void>}
 */
export async function preparePage(page, userAgent) {
    const blockedResourceTypes = new Set(['image', 'font', 'media']);

    try {
        await page.setUserAgent(userAgent);
        await page.setViewport({ width: 1366, height: 768 });

        // Block images, fonts and media to speed up loading
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            if (req.isInterceptResolutionHandled?.()) {
                return;
            }

            const resolution = blockedResourceTypes.has(req.resourceType())
                ? req.abort()
                : req.continue();

            // abort()/continue() reject when the request was resolved or torn down meanwhile;
            // catch that here so it does not surface as an unhandled rejection.
            Promise.resolve(resolution).catch((error) => {
                log.debug(`Request interception skipped: ${error.message}`);
            });
        });

        // Set extra headers
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        });
    } catch (error) {
        log.warning(`Failed to prepare page: ${error.message}`);
    }
}
/**
 * Navigate to a URL, retrying with a growing delay.
 *
 * An attempt succeeds when the navigation yields a response with a status
 * below 400. Thrown navigation errors and retryable statuses (5xx, 408, 429,
 * or no response) wait `2000 * attempt` ms before the next attempt. Other 4xx
 * statuses are treated as definitive and end the retries immediately.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} url - URL to load.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<boolean>} `true` once the page loaded successfully, otherwise `false`.
 */
export async function navigateToUrl(page, url, maxRetries = 3) {
    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            log.info(`Navigation attempt ${attempt}/${maxRetries} for ${url}`);

            const response = await page.goto(url, {
                waitUntil: 'networkidle2',
                timeout: 30000,
            });
            const status = response?.status();

            if (response && status < 400) {
                return true;
            }

            log.warning(`HTTP ${status} for ${url}, attempt ${attempt}`);

            // A client error such as 403/404 will not change on an immediate retry.
            const isDefinitiveClientError = status >= 400 && status < 500 && status !== 408 && status !== 429;
            if (isDefinitiveClientError) {
                return false;
            }
        } catch (error) {
            log.warning(`Navigation attempt ${attempt} failed for ${url}: ${error.message}`);
        }

        // Back off before the next attempt (no point waiting after the last one)
        if (attempt < maxRetries) {
            await pause(2000 * attempt);
        }
    }

    return false;
}
/**
 * Detect whether a site runs on Shopify or WordPress.
 *
 * Detection methods are tried in order and the first positive one wins:
 *  1. platform markers in the URL itself,
 *  2. HTTP response headers of the page,
 *  3. in-page signals (JavaScript globals, meta generator, stylesheet links, HTML markers),
 *  4. network requests issued by the page,
 *  5. platform API endpoints.
 *
 * Only platform-specific markers are used. Generic signals (a `theme-color`
 * meta tag, a particular nginx version, or the bare words "shopify" /
 * "wordpress" anywhere in the HTML) are not, because they match unrelated sites.
 *
 * Errors are logged and result in "nothing detected" rather than being thrown.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} url - URL of the site being analysed.
 * @param {object} [config] - Optional configuration; `config.DELAYS.pageLoad` (ms) sets how long
 *   to let the page settle before the in-page checks (default 3000).
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean, detectionMethod: (string|null)}>}
 */
export async function detectTechnologies(page, url, config) {
    const result = {
        isShopify: false,
        isWordPress: false,
        detectionMethod: null,
    };

    /** Marks the platform flag, records the method and returns the result object. */
    const conclude = (platformFlag, method) => {
        result[platformFlag] = true;
        result.detectionMethod = method;
        return result;
    };

    try {
        // Method 1: Quick URL-based detection
        const urlLower = url.toLowerCase();
        if (['myshopify.com', 'shopifycdn.com', 'shopifycs.com'].some((marker) => urlLower.includes(marker))) {
            return conclude('isShopify', 'URL');
        }
        if (['wordpress.com', 'wp-content', 'wp-includes'].some((marker) => urlLower.includes(marker))) {
            return conclude('isWordPress', 'URL');
        }

        // Method 2: HTTP headers detection
        const response = await page.goto(url, { waitUntil: 'networkidle2' });
        const headers = response?.headers() || {};

        const hasShopifyHeader =
            Boolean(headers['x-shopify-stage']) ||
            Boolean(headers['x-shopify-shop-api-call-limit']) ||
            Boolean(headers['x-shopify-request-id']) ||
            Boolean(headers['x-shopid']) ||
            String(headers['powered-by'] || '').toLowerCase().includes('shopify');
        if (hasShopifyHeader) {
            return conclude('isShopify', 'HTTP Headers');
        }

        const hasWordPressHeader =
            Boolean(headers['x-pingback']) ||
            String(headers['link'] || '').includes('wp-json');
        if (hasWordPressHeader) {
            return conclude('isWordPress', 'HTTP Headers');
        }

        // Let late scripts run before inspecting the document
        const settleDelayMs = config?.DELAYS?.pageLoad ?? 3000;
        await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

        // Method 3: In-page detection
        const detectionResult = await page.evaluate(() => {
            const shopifyMarkers = [
                // Hosts
                'cdn.shopify.com',
                'shopifycdn.com',
                'myshopify.com',
                'monorail-edge.shopifysvc.com',
                // Meta tags
                'shopify-digital-wallet',
                'shopify-checkout-api-token',
                // CSS classes and IDs
                'shopify-section',
                'shopify-policy-list',
                'shopify-product-form',
                // Script sources
                'assets/theme.js',
                'assets/option_selection.js',
                '/wpm@',
            ];

            const wordpressMarkers = [
                'wp-content',
                'wp-includes',
                'wp-admin',
                '/wp-json/',
                'wp-embed',
                'wlwmanifest',
                'xmlrpc.php',
            ];

            let shopifyDetected = false;
            let wordpressDetected = false;
            let method = 'DOM';

            // JavaScript globals defined by Shopify storefronts
            if (typeof window.Shopify !== 'undefined' || typeof window.ShopifyAnalytics !== 'undefined') {
                shopifyDetected = true;
                method = 'JavaScript objects';
            }

            const htmlContent = document.documentElement.outerHTML.toLowerCase();

            // Shopify markers in the HTML
            const shopifyMarker = shopifyMarkers.find((marker) => htmlContent.includes(marker));
            if (shopifyMarker !== undefined) {
                shopifyDetected = true;
                if (method === 'DOM') method = `DOM (${shopifyMarker})`;
            }

            // WordPress markers in the HTML (only when Shopify was not found)
            if (!shopifyDetected) {
                const wordpressMarker = wordpressMarkers.find((marker) => htmlContent.includes(marker));
                if (wordpressMarker !== undefined) {
                    wordpressDetected = true;
                    if (method === 'DOM') method = `DOM (${wordpressMarker})`;
                }
            }

            // Meta generator tags
            for (const generator of document.querySelectorAll('meta[name="generator"]')) {
                const content = generator.getAttribute('content')?.toLowerCase() || '';
                if (content.includes('shopify')) {
                    shopifyDetected = true;
                    method = 'Meta generator';
                } else if (content.includes('wordpress')) {
                    wordpressDetected = true;
                    method = 'Meta generator';
                }
            }

            // Stylesheet links
            for (const link of document.querySelectorAll('link[rel="stylesheet"]')) {
                const href = link.getAttribute('href')?.toLowerCase() || '';
                if (href.includes('shopifycdn.com') || href.includes('cdn.shopify.com') || href.includes('assets/theme.css')) {
                    shopifyDetected = true;
                    method = 'CSS links';
                } else if (href.includes('wp-content') || href.includes('wp-includes')) {
                    wordpressDetected = true;
                    method = 'CSS links';
                }
            }

            return {
                shopify: shopifyDetected,
                wordpress: wordpressDetected,
                method,
            };
        });

        // Shopify takes precedence when both were flagged
        if (detectionResult.shopify) {
            return conclude('isShopify', detectionResult.method);
        }
        if (detectionResult.wordpress) {
            return conclude('isWordPress', detectionResult.method);
        }

        // Method 4: Network requests analysis
        log.info(`Checking network requests for ${url}`);
        const networkDetection = await checkNetworkRequests(page, url);
        if (networkDetection.isShopify) {
            return conclude('isShopify', 'Network requests');
        }
        if (networkDetection.isWordPress) {
            return conclude('isWordPress', 'Network requests');
        }

        // Method 5: API endpoint testing
        const apiDetection = await testApiEndpoints(page, url);
        if (apiDetection.isShopify) {
            return conclude('isShopify', 'API endpoints');
        }
        if (apiDetection.isWordPress) {
            return conclude('isWordPress', 'API endpoints');
        }
    } catch (error) {
        log.error(`Detection error for ${url}: ${error.message}`);
    }

    return result;
}
/**
 * Watch the page's outgoing requests for platform indicators.
 *
 * Resolves as soon as a request URL contains a Shopify or WordPress marker, or
 * with both flags `false` once the timeout elapses. The request listener is
 * removed in every case. The page is scrolled to trigger lazy-loaded
 * resources, but no probe requests are issued from here: a self-made request
 * to `/products.json` would itself match the Shopify markers and flag every site.
 *
 * @param {object} page - Puppeteer page (must support `on`, `off` and `evaluate`).
 * @param {string} url - URL of the analysed site (currently unused).
 * @param {number} [timeoutMs=5000] - How long to listen before giving up.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function checkNetworkRequests(page, url, timeoutMs = 5000) {
    const shopifyMarkers = [
        'cdn.shopify.com',
        'shopifycdn.com',
        'shopifycs.com',
        'monorail-edge.shopifysvc.com',
        '/cart/add',
        '/products.json',
    ];
    const wordpressMarkers = ['wp-content', 'wp-includes', 'wp-admin', 'wp-json'];

    return new Promise((resolve) => {
        let timer;

        // Single exit point: stop the timer, detach the listener, settle the promise.
        const finish = (outcome) => {
            clearTimeout(timer);
            page.off('request', onRequest);
            resolve(outcome);
        };

        const onRequest = (request) => {
            const requestUrl = request.url().toLowerCase();

            if (shopifyMarkers.some((marker) => requestUrl.includes(marker))) {
                finish({ isShopify: true, isWordPress: false });
            } else if (wordpressMarkers.some((marker) => requestUrl.includes(marker))) {
                finish({ isShopify: false, isWordPress: true });
            }
        };

        timer = setTimeout(() => finish({ isShopify: false, isWordPress: false }), timeoutMs);
        page.on('request', onRequest);

        // Scroll to trigger lazy loading. Best effort: the page may already be gone,
        // in which case the timeout above settles the promise.
        page.evaluate(() => {
            window.scrollTo(0, document.body.scrollHeight / 2);
        }).catch(() => {});
    });
}
/**
 * Probe well-known API endpoints to identify the platform.
 *
 * Each endpoint is resolved against `baseUrl` and loaded with `page.goto`.
 * Shopify endpoints are tried first; a 200 JSON response mentioning
 * `products`, `handle` or `variants` counts as Shopify. Otherwise the
 * WordPress endpoints are tried; a 200 response mentioning `wp:`,
 * `wordpress`, `wp-json` or `methodName` counts as WordPress.
 *
 * Because probing navigates the page, the URL the page was showing on entry
 * is reloaded before returning, so code that inspects the page afterwards
 * does not end up reading an API endpoint instead of the site.
 *
 * @param {object} page - Puppeteer-style page exposing `goto` and `url`.
 * @param {string} baseUrl - Site URL the endpoint paths are resolved against.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>} Never rejects
 *   for probe failures; an unreachable endpoint is treated as "no match".
 */
async function testApiEndpoints(page, baseUrl) {
    const result = { isShopify: false, isWordPress: false };
    const navigationOptions = { waitUntil: 'networkidle2', timeout: 10000 };

    // Shopify endpoints to test
    const shopifyEndpoints = [
        '/products.json',
        '/cart.js',
        '/collections.json',
        '/api/2023-01/products.json',
    ];

    // WordPress endpoints to test
    const wordpressEndpoints = [
        '/wp-json/',
        '/wp-json/wp/v2/',
        '/xmlrpc.php',
        '/?rest_route=/',
    ];

    // Remember where the page was so it can be put back after probing.
    const originalUrl = typeof page.url === 'function' ? page.url() : null;

    const looksLikeShopify = async (response) => {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) {
            return false;
        }
        const text = await response.text();
        return text.includes('products') || text.includes('handle') || text.includes('variants');
    };

    const looksLikeWordPress = async (response) => {
        const text = await response.text();
        return text.includes('wp:')
            || text.includes('wordpress')
            || text.includes('wp-json')
            || text.includes('methodName');
    };

    // Load one endpoint and run the platform check on a 200 response.
    const endpointMatches = async (endpoint, looksLikePlatform) => {
        try {
            const testUrl = new URL(endpoint, baseUrl).href;
            const response = await page.goto(testUrl, navigationOptions);
            if (!response || response.status() !== 200) {
                return false;
            }
            return await looksLikePlatform(response);
        } catch (e) {
            // Endpoint not available (or unreadable): keep probing the others.
            return false;
        }
    };

    const restoreOriginalPage = async () => {
        if (!originalUrl || page.url() === originalUrl) {
            return;
        }
        try {
            await page.goto(originalUrl, navigationOptions);
        } catch (error) {
            log.warning(`Could not restore ${originalUrl} after API endpoint testing: ${error.message}`);
        }
    };

    try {
        for (const endpoint of shopifyEndpoints) {
            if (await endpointMatches(endpoint, looksLikeShopify)) {
                result.isShopify = true;
                return result;
            }
        }

        for (const endpoint of wordpressEndpoints) {
            if (await endpointMatches(endpoint, looksLikeWordPress)) {
                result.isWordPress = true;
                return result;
            }
        }
    } catch (error) {
        log.warning(`API endpoint testing failed: ${error.message}`);
    } finally {
        // Runs on every exit path, including the early returns above.
        await restoreOriginalPage();
    }

    return result;
}
/**
 * Detect email marketing tools on the current page.
 *
 * The page's full HTML is lower-cased and searched for each tool's
 * indicator strings (case-insensitive substring match); a tool counts as
 * present as soon as one of its indicators is found.
 *
 * @param {object} page - Puppeteer-style page exposing `evaluate`.
 * @param {Object<string, string[]>} emailTools - Tool name -> indicator strings.
 * @returns {Promise<Object<string, boolean>>} One flag per configured tool;
 *   every flag is false when the in-page scan fails (a warning is logged).
 */
export async function detectEmailTools(page, emailTools) {
    // Baseline: every configured tool reported as absent.
    const results = Object.fromEntries(
        Object.keys(emailTools).map((tool) => [tool, false]),
    );

    try {
        // Runs inside the browser context; `toolsConfig` is the serialised config.
        const scanPage = (toolsConfig) => {
            const pageHtml = document.documentElement.outerHTML.toLowerCase();
            const found = {};

            for (const [toolName, indicators] of Object.entries(toolsConfig)) {
                let present = false;

                for (const indicator of indicators) {
                    present = pageHtml.includes(indicator.toLowerCase());
                    if (present) {
                        break;
                    }
                }

                found[toolName] = present;
            }

            return found;
        };

        const detection = await page.evaluate(scanPage, emailTools);
        Object.assign(results, detection);
    } catch (error) {
        log.warning(`Email tools detection failed: ${error.message}`);
    }

    return results;
}
src/utils.js
/**
 * Utility functions for the crawler
 */
/**
 * Pick one entry of the user-agent list uniformly at random.
 *
 * @param {string[]} userAgents - Candidate user-agent strings.
 * @returns {string|undefined} A random entry; `undefined` when the list is empty.
 */
export function getRandomUserAgent(userAgents) {
    const randomIndex = Math.floor(Math.random() * userAgents.length);
    return userAgents[randomIndex];
}
/**
 * Build the empty results structure used for one keyword.
 *
 * The structure holds three platform buckets (`shopify`, `wordpress`,
 * `autres`) followed by one `<platform>With<Tool>` bucket per email tool for
 * Shopify and then for WordPress. Every bucket is a fresh empty array.
 *
 * @returns {Object<string, Array>} Newly allocated results object.
 */
export function initializeResults() {
    const emailToolNames = [
        'Klaviyo', 'Mailchimp', 'Brevo', 'Hubspot', 'Salesforce',
        'Convertkit', 'Aweber', 'Getresponse', 'Activecampaign',
        'Constantcontact', 'Omnisend', 'Privy', 'Yotpo', 'Attentive',
    ];

    const buckets = { shopify: [], wordpress: [], autres: [] };

    // Shopify combinations first, then WordPress, to keep the key order stable.
    for (const platform of ['shopify', 'wordpress']) {
        for (const tool of emailToolNames) {
            buckets[`${platform}With${tool}`] = [];
        }
    }

    return buckets;
}
/**
 * Shape a keyword's raw results into the output record.
 *
 * The record carries an ISO timestamp, a summary of per-platform counts, the
 * three platform lists, and a `<platform>_avec_<tool>` entry for each
 * platform/email-tool bucket that is not empty.
 *
 * @param {Object<string, Array>} results - Results object containing the
 *   platform lists and every `<platform>With<Tool>` bucket.
 * @returns {object} Formatted record.
 */
export function formatKeywordResults(results) {
    const { shopify, wordpress, autres } = results;

    const formatted = {
        timestamp: new Date().toISOString(),
        summary: {
            totalSites: shopify.length + wordpress.length + autres.length,
            shopifyCount: shopify.length,
            wordpressCount: wordpress.length,
            otherCount: autres.length,
        },
        shopify,
        wordpress,
        autres,
    };

    // Add email tool combinations
    const emailTools = [
        'Klaviyo', 'Mailchimp', 'Brevo', 'Hubspot', 'Salesforce',
        'Convertkit', 'Aweber', 'Getresponse', 'Activecampaign',
        'Constantcontact', 'Omnisend', 'Privy', 'Yotpo', 'Attentive',
    ];

    for (const tool of emailTools) {
        const outputSuffix = tool.toLowerCase();

        for (const platform of ['shopify', 'wordpress']) {
            const bucket = results[`${platform}With${tool}`];

            // Empty combinations are left out of the output entirely.
            if (bucket.length > 0) {
                formatted[`${platform}_avec_${outputSuffix}`] = bucket;
            }
        }
    }

    return formatted;
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
export function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
/**
 * Normalize a URL so equivalent addresses compare equal.
 *
 * The result is `protocol//host[:port]path` with the host lower-cased, a
 * leading `www.` removed, and one trailing slash removed from non-root
 * paths. Query string and fragment are not part of the result. An explicit
 * port is kept, so `example.com:8080` and `example.com` stay distinct.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized URL, or the input unchanged when it cannot be parsed.
 */
export function normalizeUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    } catch (error) {
        // Not a parseable absolute URL: hand it back untouched.
        return url;
    }

    let hostname = parsed.hostname.toLowerCase();

    // Remove www.
    if (hostname.startsWith('www.')) {
        hostname = hostname.substring(4);
    }

    // `port` is empty for the scheme's default port, so only explicit,
    // non-default ports are emitted.
    const port = parsed.port ? `:${parsed.port}` : '';

    // Remove trailing slash from pathname (the root "/" is kept)
    let pathname = parsed.pathname;
    if (pathname.endsWith('/') && pathname.length > 1) {
        pathname = pathname.slice(0, -1);
    }

    return `${parsed.protocol}//${hostname}${port}${pathname}`;
}
/**
 * Return the distinct values of an array, in first-seen order.
 *
 * @param {Iterable} array - Values to de-duplicate (compared as a `Set` compares them).
 * @returns {Array} New array holding each value once.
 */
export function removeDuplicates(array) {
    const uniqueValues = new Set(array);
    return Array.from(uniqueValues);
}
/**
 * Extract the domain from a URL.
 *
 * Returns the lower-cased hostname with a leading `www.` removed. Only a
 * prefix is stripped: hosts that merely contain `www.` somewhere
 * (e.g. `awww.example.com`) are returned intact.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} Domain, or `null` when the URL cannot be parsed.
 */
export function extractDomain(url) {
    try {
        const hostname = new URL(url).hostname.toLowerCase();
        return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
    } catch (error) {
        return null;
    }
}
/**
 * Tell whether a string parses as an absolute URL.
 *
 * @param {string} string - Candidate URL.
 * @returns {boolean} `true` when `new URL(string)` succeeds, `false` when it throws.
 */
export function isValidUrl(string) {
    let parsed = null;

    try {
        parsed = new URL(string);
    } catch (_) {
        parsed = null;
    }

    return parsed !== null;
}
/**
 * Pick a random whole number of milliseconds in the range [min, max].
 *
 * @param {number} [min=1000] - Smallest delay (inclusive).
 * @param {number} [max=3000] - Largest delay (inclusive).
 * @returns {number} Random delay in milliseconds.
 */
export function randomDelay(min = 1000, max = 3000) {
    const span = max - min + 1;
    const offset = Math.floor(Math.random() * span);
    return offset + min;
}
/**
 * Run an async function, retrying with exponential backoff on failure.
 *
 * `fn` is attempted up to `maxRetries` times in total. After a failed
 * attempt k (1-based) the function waits `baseDelay * 2^(k-1)` ms before
 * trying again. `fn` is always attempted at least once: a `maxRetries` that
 * is zero, negative or not a number is treated as 1, and a fractional value
 * is rounded down, so the last error is never silently dropped.
 *
 * @param {Function} fn - Operation to run; may return a promise.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @param {number} [baseDelay=1000] - Delay before the second attempt, in ms.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws Rethrows the error of the final failed attempt.
 */
export async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
    const totalAttempts = Math.max(1, Math.floor(Number(maxRetries)) || 1);

    for (let attempt = 1; ; attempt++) {
        try {
            return await fn();
        } catch (error) {
            if (attempt >= totalAttempts) {
                throw error;
            }

            const delay = baseDelay * Math.pow(2, attempt - 1);
            await sleep(delay);
        }
    }
}
/**
 * Clean and validate a keyword.
 *
 * The keyword is trimmed and lower-cased. Anything that is not a string, and
 * any string that is empty once trimmed, is rejected with `null` so callers
 * get a single "invalid" value to check for.
 *
 * @param {*} keyword - Raw keyword.
 * @returns {string|null} Cleaned keyword, or `null` when it is unusable.
 */
export function cleanKeyword(keyword) {
    if (typeof keyword !== 'string') {
        return null;
    }

    const cleaned = keyword.trim().toLowerCase();
    return cleaned === '' ? null : cleaned;
}
/**
 * Turn a keyword into a filename-safe slug.
 *
 * Runs of ASCII letters and digits are kept, everything between them
 * collapses to a single underscore, and the result is lower-cased with no
 * leading or trailing underscore.
 *
 * @param {string} keyword - Keyword to convert.
 * @returns {string} Slug; empty when the keyword has no ASCII letters or digits.
 */
export function generateSafeFilename(keyword) {
    // Collect the alphanumeric runs; whatever separated them becomes "_".
    const alphanumericRuns = keyword.match(/[a-zA-Z0-9]+/g) || [];
    return alphanumericRuns.join('_').toLowerCase();
}
/**
 * Calculate a detection confidence score.
 *
 * Each detection method contributes its weight to the maximum score, and to
 * the achieved score when it fired. A method with no configured weight
 * counts as 1; an explicit weight of 0 is honoured and removes the method
 * from the score.
 *
 * @param {Object<string, boolean>} detectionMethods - Method name -> whether it detected.
 * @param {Object<string, number>} [weights] - Method name -> weight; may be omitted.
 * @returns {number} Percentage in the range 0-100; 0 when there is nothing to score.
 */
export function calculateConfidenceScore(detectionMethods, weights) {
    const weightTable = weights ?? {};
    let totalScore = 0;
    let maxPossibleScore = 0;

    for (const [method, detected] of Object.entries(detectionMethods)) {
        // `??` rather than `||`: only a missing weight falls back to 1.
        const weight = weightTable[method] ?? 1;
        maxPossibleScore += weight;

        if (detected) {
            totalScore += weight;
        }
    }

    return maxPossibleScore > 0 ? (totalScore / maxPossibleScore) * 100 : 0;
}
/**
 * Compute, print and return per-keyword platform statistics.
 *
 * @param {{shopify: Array, wordpress: Array, autres: Array}} results - Classified sites.
 * @param {string} keyword - Keyword the results belong to.
 * @returns {object} Counts per platform, their total, an ISO timestamp and
 *   the rounded Shopify / WordPress shares (0 when nothing was analysed).
 */
export function logStatistics(results, keyword) {
    const shopify = results.shopify.length;
    const wordpress = results.wordpress.length;
    const other = results.autres.length;
    const totalAnalyzed = shopify + wordpress + other;

    // Share of the analysed sites, as a whole percentage.
    const toPercentage = (count) => (
        totalAnalyzed > 0 ? Math.round((count / totalAnalyzed) * 100) : 0
    );

    const stats = {
        keyword,
        timestamp: new Date().toISOString(),
        totalAnalyzed,
        shopify,
        wordpress,
        other,
        shopifyPercentage: toPercentage(shopify),
        wordpressPercentage: toPercentage(wordpress),
    };

    console.log(`📊 Statistics for "${keyword}":`, stats);
    return stats;
}
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{ "name": "shopify-wordpress-crawler", "version": "2.0.0", "description": "Enhanced Apify Actor for analyzing Shopify and WordPress usage with improved detection", "main": "main.js", "type": "module", "scripts": { "start": "node main.js", "test": "echo \"Error: no test specified\" && exit 1" }, "keywords": [ "apify", "shopify", "wordpress", "crawler", "web-scraping", "ecommerce", "cms-detection" ], "author": "Your Name", "license": "MIT", "dependencies": { "apify": "^3.1.0", "crawlee": "^3.5.0", "puppeteer": "^21.0.0" }, "repository": { "type": "git", "url": "https://github.com/your-username/shopify-wordpress-crawler" }}