Recherche CMS
Pricing
Pay per usage
Go to Apify Store
Recherche CMS
Under maintenance0.0 (0)
Pricing
Pay per usage
1
5
2
Last modified
5 months ago
Pricing
Pay per usage
0.0 (0)
Pricing
Pay per usage
1
5
2
Last modified
5 months ago
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Run the image. If you know you won't need headful browsers,# you can remove the XVFB start script for a micro perf gain.CMD ./start_xvfb_and_run_cmd.sh && npm start --silent{    "actorSpecification": 1,    "name": "my-actor-1",    "title": "Project Puppeteer Crawler JavaScript",    "description": "Crawlee and Puppeteer project in JavaScript.",    "version": "0.0",    "meta": {        "templateId": "js-crawlee-puppeteer-chrome"    },    "input": "./input_schema.json",    "dockerfile": "./Dockerfile"}{    "title": "Analyse CMS et Marketing Email - Version Améliorée",    "type": "object",    "schemaVersion": 1,    "description": "Analysez les sites web pour détecter leur CMS (Shopify/WordPress) et leurs outils d'email marketing avec une détection Shopify améliorée",    "properties": {        "keywords": {            "title": "Termes de recherche",            "type": "array",            "description": "Entrez un ou plusieurs mots-clés à rechercher sur Google (un par ligne)",            "editor": "stringList",            "default": ["vêtements", "mode"],            "sectionCaption": "Paramètres de recherche",            "sectionDescription": "Configurez votre recherche Google"        },        "maxPages": {            "title": "Nombre de pages Google par mot-clé",            "type": "integer",            "description": "Nombre de pages de résultats Google à analyser pour chaque mot-clé",            "default": 3,            "minimum": 1,            "maximum": 10,            "unit": "page(s)"        },        "maxUrlsToAnalyze": {            "title": "Nombre de sites à analyser par mot-clé",            "type": "integer",            "description": "Combien de sites souhaitez-vous scanner pour chaque mot-clé",            "default": 30,            "minimum": 5,            "maximum": 100,            "unit": "site(s)"        },        "country": {            "title": "Pays pour les résultats",            "type": "string",            "description": "Choisissez le pays pour 
lequel vous souhaitez voir les résultats Google",            "editor": "select",            "default": "fr",            "enum": ["fr", "be", "ch", "ca", "ma", "us", "gb"],            "enumTitles": ["France", "Belgique", "Suisse", "Canada", "Maroc", "États-Unis", "Royaume-Uni"]        }    },    "required": ["keywords"]}1/**2 * Enhanced configuration for the crawler3 */4export const CONFIG = {5    // Search parameters6    DEFAULT_COUNTRY: 'fr',7    MAX_PAGES_PER_KEYWORD: 3,8    MAX_URLS_TO_ANALYZE: 30,9    10    // Google search settings11    GOOGLE_DOMAINS: {12        'fr': 'google.fr',13        'be': 'google.be', 14        'ch': 'google.ch',15        'ca': 'google.ca',16        'ma': 'google.co.ma',17        'us': 'google.com',18        'gb': 'google.co.uk'19    },20    21    // Browser configuration22    BROWSER_OPTIONS: {23        headless: true,24        args: [25            '--no-sandbox',26            '--disable-setuid-sandbox',27            '--disable-dev-shm-usage',28            '--disable-accelerated-2d-canvas',29            '--no-first-run',30            '--no-zygote',31            '--disable-gpu',32            '--window-size=1366,768',33            '--hide-scrollbars',34            '--disable-notifications',35            '--disable-background-timer-throttling',36            '--disable-backgrounding-occluded-windows',37            '--disable-renderer-backgrounding',38            '--disable-features=TranslateUI',39            '--disable-ipc-flooding-protection'40        ]41    },42    43    // Crawler settings44    CRAWLER_OPTIONS: {45        maxConcurrency: 3,46        navigationTimeoutSecs: 45,47        maxRequestRetries: 248    },49    50    // User agents for rotation51    USER_AGENTS: [52        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',53        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',54        'Mozilla/5.0 
(Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',55        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',56        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15'57    ],58    59    // Enhanced Shopify detection indicators60    SHOPIFY_INDICATORS: {61        // URL patterns62        urls: [63            'myshopify.com',64            'shopifycdn.com',65            'shopifycs.com',66            'shopify-analytics.com',67            'monorail-edge.shopifysvc.com'68        ],69        70        // HTTP headers71        headers: [72            'x-shopify-stage',73            'x-shopify-shop-api-call-limit',74            'x-shopify-request-id',75            'x-shopify-shop-domain',76            'x-shopify-api-version'77        ],78        79        // JavaScript objects80        javascript: [81            'Shopify',82            'Shopify.theme',83            'Shopify.shop',84            'Shopify.currency',85            'ShopifyAnalytics',86            'Shopify.checkout',87            'Shopify.routes'88        ],89        90        // DOM elements and classes91        dom: [92            'shopify-section',93            'shopify-policy-list',94            'shopify-product-form',95            'shopify-payment-button',96            'shopify-cleanslate',97            'shopify-product-reviews-badge'98        ],99        100        // Meta tags101        meta: [102            'shopify-digital-wallet',103            'shopify-checkout-api-token',104            'shopify-checkout-domain'105        ],106        107        // Script and CSS sources108        assets: [109            'assets/theme.js',110            'assets/option_selection.js',111            'assets/theme.css',112            '/wpm@',113            'cdn.shopify.com',114            'v3.shopifycdn.com'115        ],116        117        // API endpoints118   
     endpoints: [119            '/products.json',120            '/cart.js',121            '/collections.json',122            '/cart/add',123            '/cart/update',124            '/api/2023-01/',125            '/api/2024-01/'126        ],127        128        // Text patterns in HTML129        textPatterns: [130            'powered by shopify',131            'shopify-checkout',132            'shopify-features',133            'shopify-money-format',134            'var shopifycdn'135        ]136    },137    138    // Enhanced WordPress detection indicators  139    WORDPRESS_INDICATORS: {140        // URL patterns141        urls: [142            'wp-content',143            'wp-includes', 144            'wp-admin',145            'wordpress.com',146            'wp-json'147        ],148        149        // HTTP headers150        headers: [151            'x-pingback',152            'x-powered-by-wordpress'153        ],154        155        // DOM elements and classes156        dom: [157            'wp-block',158            'wp-container',159            'wp-site-blocks',160            'wp-block-group',161            'wp-block-column',162            'wp-block-paragraph',163            'wp-embed',164            'wp-caption',165            'wp-image'166        ],167        168        // Meta tags169        meta: [170            'generator',171            'wlwmanifest'172        ],173        174        // Script and CSS sources175        assets: [176            'wp-content/themes',177            'wp-content/plugins',178            'wp-includes/js',179            'wp-includes/css',180            'wp-emoji-release.min.js'181        ],182        183        // API endpoints184        endpoints: [185            '/wp-json/',186            '/wp-json/wp/v2/',187            '/xmlrpc.php',188            '/?rest_route=/',189            '/wp-admin/admin-ajax.php'190        ],191        192        // Text patterns in HTML193        textPatterns: [194            'powered by 
wordpress',195            'wp-json',196            'wp_enqueue_script',197            'wp_head',198            'wordpress'199        ]200    },201    202    // Email marketing tools detection203    EMAIL_TOOLS: {204        klaviyo: [205            'klaviyo',206            'klaviyo.com',207            'static.klaviyo.com',208            'kla.js'209        ],210        mailchimp: [211            'mailchimp',212            'mc.us',213            'chimpstatic.com',214            'mc4wp',215            'mailchimp-woocommerce'216        ],217        brevo: [218            'brevo',219            'sendinblue',220            'sibforms',221            'mailin.fr',222            'sib-api'223        ],224        hubspot: [225            'hubspot',226            'hs-scripts.com',227            'hsforms.net',228            'hubapi.com'229        ],230        salesforce: [231            'salesforce',232            'pardot',233            'exacttarget',234            'marketingcloud'235        ],236        convertkit: [237            'convertkit',238            'ck.page',239            'convertkit-mail'240        ],241        aweber: [242            'aweber',243            'aweber.com',244            'forms.aweber.com'245        ],246        getresponse: [247            'getresponse',248            'getresponse.com',249            'gr-cdn.com'250        ],251        activecampaign: [252            'activecampaign',253            'activehosted.com',254            'trackcmp.net'255        ],256        constantcontact: [257            'constantcontact',258            'ctctcdn.com',259            'constantcontacts.com'260        ],261        omnisend: [262            'omnisend',263            'omnisrc.com',264            'omnisend.com'265        ],266        privy: [267            'privy',268            'privy.com',269            'widget.privy.com'270        ],271        yotpo: [272            'yotpo',273            'staticw2.yotpo.com',274            'yotpo.com'275        ],276       
 attentive: [277            'attentive',278            'attentivemobile.com',279            'attn.tv'280        ]281    },282    283    // Fallback URLs for testing284    FALLBACK_URLS: [285        'https://www.shopify.com',286        'https://wordpress.org',287        'https://example.com'288    ],289    290    // Delays and timeouts291    DELAYS: {292        betweenRequests: 1000,293        pageLoad: 3000,294        networkIdle: 5000,295        apiTest: 10000296    },297    298    // Detection weights (for scoring system)299    DETECTION_WEIGHTS: {300        url: 10,301        headers: 9,302        javascript: 8,303        meta: 7,304        dom: 6,305        assets: 5,306        endpoints: 8,307        textPatterns: 4308    }309};310            '1/**2 * Google search functionality3 */4import { log } from 'crawlee';5import { PuppeteerCrawler } from 'crawlee';6import { getRandomUserAgent, normalizeUrl, removeDuplicates, sleep } from './utils.js';7
/**
 * Search Google for result URLs matching a keyword.
 *
 * Builds one search request per result page (the page count is read from
 * `config.MAX_PAGES_PER_KEYWORD`, defaulting to 3), runs them through a
 * dedicated PuppeteerCrawler, collects the outbound links found on each page,
 * then validates them and passes them through `normalizeUrl` and
 * `removeDuplicates` from ./utils.js.
 *
 * @param {string} keyword - Search term to query.
 * @param {string} country - Country code. Selects the Google domain from
 *   `config.GOOGLE_DOMAINS` (falling back to the 'fr' entry) and is sent
 *   lower-cased as the `gl` query parameter.
 * @param {object} config - Reads GOOGLE_DOMAINS, MAX_PAGES_PER_KEYWORD,
 *   BROWSER_OPTIONS and USER_AGENTS.
 * @returns {Promise<string[]>} Cleaned, de-duplicated result URLs.
 */
export async function searchGoogleForUrls(keyword, country, config) {
    log.info(`🔍 Starting Google search for: "${keyword}" in ${country}`);

    const googleDomain = config.GOOGLE_DOMAINS[country] || config.GOOGLE_DOMAINS['fr'];
    const maxPages = config.MAX_PAGES_PER_KEYWORD || 3;

    // Timer-based pause. page.waitForTimeout() has been removed from recent
    // Puppeteer releases, so it cannot be relied on here.
    const pause = (ms) => new Promise((resolve) => {
        setTimeout(resolve, ms);
    });

    const allUrls = [];

    // Create a crawler for Google search
    const searchCrawler = new PuppeteerCrawler({
        launchContext: {
            launchOptions: config.BROWSER_OPTIONS,
        },
        maxConcurrency: 1, // Keep it low to avoid being blocked
        navigationTimeoutSecs: 30,
        maxRequestRetries: 2,

        // The crawler navigates to request.url itself before calling
        // requestHandler, so the user agent, viewport and fingerprint
        // overrides must be installed here to affect that navigation.
        preNavigationHooks: [
            async ({ page, request }, gotoOptions) => {
                const pageNumber = request.userData.pageNumber || 1;
                log.info(`📄 Searching Google page ${pageNumber} for "${keyword}"`);

                const userAgent = getRandomUserAgent(config.USER_AGENTS);
                await page.setUserAgent(userAgent);
                await page.setViewport({ width: 1366, height: 768 });

                // Mask the most common automation fingerprints
                await page.evaluateOnNewDocument(() => {
                    // Override the navigator.webdriver property
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined,
                    });

                    // Override the chrome property
                    window.chrome = {
                        runtime: {},
                        loadTimes: function() {},
                        csi: function() {},
                        app: {}
                    };

                    // Override plugins
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5],
                    });

                    // Override languages
                    Object.defineProperty(navigator, 'languages', {
                        get: () => ['fr-FR', 'fr', 'en'],
                    });
                });

                log.info(`🌐 Navigating to: ${request.url}`);
                gotoOptions.waitUntil = 'networkidle2';
            },
        ],

        async requestHandler({ page, request }) {
            try {
                const pageNumber = request.userData.pageNumber || 1;

                // Give the search results a moment to render
                await pause(2000);

                // Handle cookie consent if present (best effort)
                try {
                    const acceptButton = await page.$('button[id*="accept"], button[id*="consent"], #L2AGLb');
                    if (acceptButton) {
                        await acceptButton.click();
                        await pause(1000);
                    }
                } catch (e) {
                    log.info('No cookie consent found or already handled');
                }

                // Extract search result URLs
                const urls = await page.evaluate(() => {
                    const results = [];

                    // Multiple selectors for different Google layouts
                    const selectors = [
                        'a[href^="http"]:not([href*="google."])',
                        'div.g a[href^="http"]',
                        'div[data-ved] a[href^="http"]',
                        '.rc a[href^="http"]',
                        'div.yuRUbf a[href^="http"]'
                    ];

                    selectors.forEach(selector => {
                        const elements = document.querySelectorAll(selector);
                        elements.forEach(element => {
                            const href = element.href;
                            if (href &&
                                !href.includes('google.') &&
                                !href.includes('youtube.com') &&
                                !href.includes('facebook.com') &&
                                !href.includes('instagram.com') &&
                                !href.includes('twitter.com') &&
                                !href.includes('linkedin.com') &&
                                !href.includes('wikipedia.org') &&
                                !href.includes('webcache.googleusercontent.com')) {
                                results.push(href);
                            }
                        });
                    });

                    return results;
                });

                log.info(`Found ${urls.length} URLs on page ${pageNumber}`);

                // Add URLs to the collection
                allUrls.push(...urls);
            } catch (error) {
                log.error(`Error searching Google page: ${error.message}`);
                throw error;
            }
        },

        // Crawlee passes the error as the SECOND argument, not as a property
        // of the crawling context.
        failedRequestHandler(_context, error) {
            log.error(`Google search request failed: ${error.message}`);
        }
    });

    // Generate search URLs for each page
    const searchUrls = [];
    for (let page = 1; page <= maxPages; page++) {
        const start = (page - 1) * 10;
        const searchUrl = `https://${googleDomain}/search?q=${encodeURIComponent(keyword)}&start=${start}&num=10&hl=fr&gl=${country.toLowerCase()}`;

        searchUrls.push({
            url: searchUrl,
            userData: { pageNumber: page }
        });
    }

    // Add search requests to crawler
    await searchCrawler.addRequests(searchUrls);

    // Run the search crawler
    await searchCrawler.run();

    // Clean and deduplicate URLs
    let cleanedUrls = allUrls
        .map(url => {
            try {
                // Clean Google redirect URLs
                if (url.includes('google.') && url.includes('url?q=')) {
                    const urlParams = new URLSearchParams(url.split('?')[1]);
                    return urlParams.get('q') || url;
                }
                return url;
            } catch (e) {
                return url;
            }
        })
        .filter(url => {
            try {
                new URL(url);
                return true;
            } catch (e) {
                return false;
            }
        })
        .map(url => normalizeUrl(url));

    // Remove duplicates
    cleanedUrls = removeDuplicates(cleanedUrls);

    log.info(`✅ Google search completed. Found ${cleanedUrls.length} unique URLs for "${keyword}"`);

    // Log some sample URLs for debugging
    if (cleanedUrls.length > 0) {
        log.info(`Sample URLs found: ${cleanedUrls.slice(0, 3).join(', ')}`);
    }

    return cleanedUrls;
}
/**
 * Alternative Google search (fallback placeholder).
 *
 * Not implemented yet: it only logs that the alternative path was taken and
 * resolves to an empty array. `country` and `config` are accepted but
 * currently unused.
 *
 * @param {string} keyword - Search term, used only in the log message.
 * @param {string} country - Unused.
 * @param {object} config - Unused.
 * @returns {Promise<string[]>} Always an empty array.
 */
export async function searchGoogleAlternative(keyword, country, config) {
    log.info(`🔄 Using alternative Google search for: "${keyword}"`);

    // This is a simplified version that could be implemented
    // using HTTP requests instead of browser automation
    // For now, return empty array to trigger fallback URLs

    return [];
}
/**
 * Extract and clean URLs from Google search results HTML.
 *
 * Scans `href="..."` and `data-href="..."` attributes for absolute http(s)
 * URLs, decodes `&amp;` back to `&`, drops anything that does not parse as a
 * URL, and filters out Google plus a fixed list of social/reference sites.
 * The blocklist is checked against the hostname only, so a result that merely
 * mentions a blocked domain in its path or query string is kept.
 *
 * @param {string} html - Raw HTML of a search results page.
 * @returns {string[]} Matching URLs in document order (duplicates that occur
 *   in the HTML itself are preserved).
 */
function extractUrlsFromHtml(html) {
    const urls = [];
    if (typeof html !== 'string' || html === '') {
        return urls;
    }

    // Full domains: blocked when equal to, or a parent of, the hostname.
    const blockedDomains = [
        'youtube.com',
        'facebook.com',
        'instagram.com',
        'twitter.com',
        'linkedin.com',
        'wikipedia.org'
    ];
    // Brand prefixes spanning many TLDs: blocked when they start a hostname label.
    const blockedBrandPrefixes = ['google.'];

    const isBlockedHost = (hostname) =>
        blockedBrandPrefixes.some((prefix) => hostname.startsWith(prefix) || hostname.includes(`.${prefix}`)) ||
        blockedDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));

    // A single pattern covers both attributes. Two separate patterns would
    // collect every data-href twice, because `href="` is a suffix of `data-href="`.
    const attributePattern = /(?:data-)?href="(https?:\/\/[^"]+)"/g;

    for (const match of html.matchAll(attributePattern)) {
        const url = match[1].replaceAll('&amp;', '&');

        let hostname;
        try {
            hostname = new URL(url).hostname.toLowerCase();
        } catch {
            continue; // Not a parseable URL
        }

        if (!isBlockedHost(hostname)) {
            urls.push(url);
        }
    }

    return urls;
}
/**
 * Generate search query with variations.
 *
 * @param {string} baseKeyword - Keyword to build queries from.
 * @returns {string[]} The base keyword followed by four variations: an
 *   exact-phrase query with `site:*.com`, and the keyword suffixed with
 *   "boutique", "shop" and "store".
 */
export function generateSearchQueries(baseKeyword) {
    // Variations broaden the result set beyond the bare keyword.
    return [
        baseKeyword,
        `"${baseKeyword}" site:*.com`,
        `${baseKeyword} boutique`,
        `${baseKeyword} shop`,
        `${baseKeyword} store`,
    ];
}
/**
 * Validate and filter URLs.
 *
 * A URL is accepted when it parses, uses http or https, and its hostname is
 * not one of the excluded search, social, reference or marketplace sites.
 * Matching is done on hostname label boundaries, so `webay.fr` is not
 * mistaken for eBay and `notgoogle.example` is not mistaken for Google.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} True when the URL is an acceptable search result.
 */
function isValidSearchResult(url) {
    // Full domains: excluded when equal to, or a parent of, the hostname.
    const excludedDomains = [
        'youtube.com',
        'facebook.com',
        'instagram.com',
        'twitter.com',
        'linkedin.com',
        'wikipedia.org',
        'aliexpress.com',
        'alibaba.com'
    ];
    // Brand prefixes spanning many TLDs: excluded when they start a hostname label.
    const excludedBrandPrefixes = [
        'google.',
        'amazon.',
        'ebay.'
    ];

    try {
        const urlObj = new URL(url);
        // Schemes such as mailto: or javascript: parse fine but are not web pages.
        if (urlObj.protocol !== 'http:' && urlObj.protocol !== 'https:') {
            return false;
        }

        const hostname = urlObj.hostname.toLowerCase();
        const matchesBrand = excludedBrandPrefixes.some(
            (prefix) => hostname.startsWith(prefix) || hostname.includes(`.${prefix}`)
        );
        const matchesDomain = excludedDomains.some(
            (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
        );

        return !matchesBrand && !matchesDomain;
    } catch (e) {
        return false;
    }
}

/**
 * Apify Actor for analyzing Shopify and WordPress usage from Google search results
 * Enhanced version with improved Shopify detection
 */
import { Actor } from 'apify';
import { PuppeteerCrawler, log } from 'crawlee';
import { CONFIG } from './src/config.js';
import {
    getRandomUserAgent,
    initializeResults,
    formatKeywordResults
} from './src/utils.js';
import { searchGoogleForUrls } from './src/googleSearch.js';
import {
    detectTechnologies,
    detectEmailTools,
    preparePage,
    navigateToUrl
} from './src/siteAnalyzer.js';
21// Main Actor function22await Actor.main(async () => {23    try {24        log.info('Starting the enhanced Shopify/WordPress analyzer');25        26        // Check if we're in test mode27        const isInTestMode = process.env.APIFY_IS_AT_HOME;28        29        // Get and process input parameters30        const input = await Actor.getInput() || {};31        const { 32            keywords = ['test'],33            country = CONFIG.DEFAULT_COUNTRY, 34            maxPages = isInTestMode ? 1 : CONFIG.MAX_PAGES_PER_KEYWORD,35            maxUrlsToAnalyze = isInTestMode ? 3 : CONFIG.MAX_URLS_TO_ANALYZE36        } = input;37        38        // Validate input39        if (!keywords || !Array.isArray(keywords) || keywords.length === 0) {40            throw new Error('At least one keyword is required');41        }42        43        log.info(`Analyzing ${keywords.length} keywords with up to ${maxUrlsToAnalyze} URLs per keyword`);44        45        // Results container for all keywords46        const allResults = {};47        48        // Process each keyword49        for (const keyword of keywords) {50            // Skip empty keywords51            if (!keyword || keyword.trim() === '') {52                log.warning('Empty keyword detected, skipping');53                continue;54            }55            56            log.info(`\n========== ANALYZING KEYWORD: ${keyword} ==========`);57            58            // Step 1: Search Google and get URLs59            let urlsToAnalyze = [];60            try {61                const searchResults = await searchGoogleForUrls(keyword, country, CONFIG);62                63                if (searchResults.length > 0) {64                    // Take only the specified number of URLs65                    urlsToAnalyze = searchResults.slice(0, maxUrlsToAnalyze);66                    log.info(`Found ${urlsToAnalyze.length} URLs to analyze for "${keyword}"`);67                } else {68                    log.warning(`No URLs found 
for "${keyword}", using fallback URLs`);69                    urlsToAnalyze = [...CONFIG.FALLBACK_URLS];70                }71            } catch (searchError) {72                log.error(`Search failed for "${keyword}": ${searchError.message}`);73                urlsToAnalyze = [...CONFIG.FALLBACK_URLS];74                log.info(`Using ${urlsToAnalyze.length} fallback URLs`);75            }76            77            // Skip if no URLs were found78            if (urlsToAnalyze.length === 0) {79                log.info(`No URLs to analyze for "${keyword}", skipping`);80                allResults[keyword] = {81                    message: "No URLs found for this search",82                    timestamp: new Date().toISOString()83                };84                continue;85            }86            87            // Initialize results for this keyword88            const results = initializeResults();89            90            // Set up proxy configuration for the crawler91            const proxyConfig = isInTestMode ? 
92                undefined : 93                await Actor.createProxyConfiguration({94                    useApifyProxy: true,95                    apifyProxyGroups: ['RESIDENTIAL']96                });97            98            // Step 2: Set up crawler to analyze each URL99            const crawler = new PuppeteerCrawler({100                proxyConfiguration: proxyConfig,101                launchContext: {102                    launchOptions: CONFIG.BROWSER_OPTIONS,103                },104                maxConcurrency: CONFIG.CRAWLER_OPTIONS.maxConcurrency,105                navigationTimeoutSecs: CONFIG.CRAWLER_OPTIONS.navigationTimeoutSecs,106                maxRequestRetries: CONFIG.CRAWLER_OPTIONS.maxRequestRetries,107                108                // Handle each URL109                async requestHandler({ page, request }) {110                    const url = request.url;111                    log.info(`Analyzing: ${url}`);112                    113                    try {114                        // Prepare the page115                        const userAgent = getRandomUserAgent(CONFIG.USER_AGENTS);116                        await preparePage(page, userAgent);117                        118                        // Navigate to the URL119                        const navigationSuccessful = await navigateToUrl(page, url);120                        121                        if (!navigationSuccessful) {122                            throw new Error("Failed to load page after multiple attempts");123                        }124                        125                        // Enhanced detection with multiple passes126                        log.info(`🔍 Starting enhanced detection for ${url}`);127                        128                        // Detect technologies (Shopify, WordPress) with enhanced methods129                        const siteInfo = await detectTechnologies(page, url, CONFIG);130                        131                        
// Detect email marketing tools132                        const emailInfo = await detectEmailTools(page, CONFIG.EMAIL_TOOLS);133                        134                        // Log detection details135                        if (siteInfo.detectionMethod) {136                            log.info(`✅ Detection method: ${siteInfo.detectionMethod}`);137                        }138                        139                        // Categorize the site140                        if (siteInfo.isShopify) {141                            results.shopify.push(url);142                            143                            // Check which email tools are used144                            for (const tool in emailInfo) {145                                if (emailInfo[tool]) {146                                    const capitalizedTool = tool.charAt(0).toUpperCase() + tool.slice(1);147                                    results[`shopifyWith${capitalizedTool}`].push(url);148                                }149                            }150                            151                            log.info(`✅ ${url}: Shopify detected (${siteInfo.detectionMethod || 'standard'})`);152                        } else if (siteInfo.isWordPress) {153                            results.wordpress.push(url);154                            155                            // Check which email tools are used156                            for (const tool in emailInfo) {157                                if (emailInfo[tool]) {158                                    const capitalizedTool = tool.charAt(0).toUpperCase() + tool.slice(1);159                                    results[`wordpressWith${capitalizedTool}`].push(url);160                                }161                            }162                            163                            log.info(`✅ ${url}: WordPress detected (${siteInfo.detectionMethod || 'standard'})`);164                        } else {165                   
         results.autres.push(url);166                            log.info(`✅ ${url}: Other platform or undetected`);167                        }168                        169                    } catch (error) {170                        log.error(`❌ Error analyzing ${url}: ${error.message}`);171                        172                        // Enhanced URL-based fallback detection173                        const urlLower = url.toLowerCase();174                        if (urlLower.includes('shopify') || 175                            urlLower.includes('myshopify') ||176                            urlLower.includes('shopifycdn') ||177                            urlLower.includes('shopifycs')) {178                            results.shopify.push(url);179                            log.info(`✅ ${url}: Shopify (detected via URL fallback)`);180                        } else if (urlLower.includes('wordpress') || 181                                   urlLower.includes('wp-') ||182                                   urlLower.includes('wp-content') ||183                                   urlLower.includes('wp-includes')) {184                            results.wordpress.push(url);185                            log.info(`✅ ${url}: WordPress (detected via URL fallback)`);186                        } else {187                            results.autres.push(url);188                        }189                    }190                },191                192                // Handle failures193                failedRequestHandler({ request, error }) {194                    log.error(`Request ${request.url} failed: ${error.message}`);195                }196            });197            198            // Add URLs to the crawler199            for (const url of urlsToAnalyze) {200                await crawler.addRequests([{ url }]);201            }202            203            // Run the crawler for this keyword204            await crawler.run();205            206            // 
Format results and store them207            allResults[keyword] = formatKeywordResults(results);208            209            log.info(`Analysis complete for "${keyword}". Sites analyzed: ${urlsToAnalyze.length}`);210            log.info(`Results: Shopify: ${results.shopify.length}, WordPress: ${results.wordpress.length}, Other: ${results.autres.length}`);211        }212        213        // Save final results to dataset214        await Actor.pushData(allResults);215        216        log.info(`\n========== ANALYSIS COMPLETE ==========`);217        log.info(`Keywords analyzed: ${keywords.length}`);218        219        // Log summary statistics220        let totalShopify = 0, totalWordPress = 0, totalOther = 0;221        for (const keyword in allResults) {222            if (allResults[keyword].shopify) {223                totalShopify += allResults[keyword].shopify.length;224                totalWordPress += allResults[keyword].wordpress.length;225                totalOther += allResults[keyword].autres.length;226            }227        }228        229        log.info(`FINAL STATS - Shopify: ${totalShopify}, WordPress: ${totalWordPress}, Other: ${totalOther}`);230        231    } catch (error) {232        log.error(`Main error: ${error.message}`);233        await Actor.pushData({234            error: error.message,235            stackTrace: error.stack,236            timestamp: new Date().toISOString()237        });238    }239});1import { Dataset, createPuppeteerRouter } from 'crawlee';2
// Router instance on which the handlers below are registered.
export const router = createPuppeteerRouter();

/**
 * Handler registered as the router's default: logs a message and enqueues
 * links matching the apify.com glob, tagging them with the 'detail' label.
 */
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});
/**
 * Handler registered for the 'detail' label: reads the page title, logs it
 * together with the loaded URL, and passes both to Dataset.pushData.
 */
router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});

/**
 * Enhanced site analyzer with improved Shopify and WordPress detection
 */
import { log } from 'crawlee';
/**
 * Prepare a page with a user agent, viewport, request filtering and headers.
 *
 * Image, font and media requests are aborted to speed up loading; every other
 * request is continued. Any failure while configuring the page is logged as a
 * warning and not rethrown.
 *
 * @param {object} page - Puppeteer page to configure.
 * @param {string} userAgent - User-agent string to apply.
 * @returns {Promise<void>}
 */
export async function preparePage(page, userAgent) {
    const blockedResourceTypes = new Set(['image', 'font', 'media']);

    try {
        await page.setUserAgent(userAgent);
        await page.setViewport({ width: 1366, height: 768 });

        // Block images and fonts to speed up loading
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            // Another listener may already have resolved this request; resolving
            // it a second time makes Puppeteer throw.
            if (req.isInterceptResolutionHandled?.()) {
                return;
            }

            const outcome = blockedResourceTypes.has(req.resourceType())
                ? req.abort()
                : req.continue();

            // abort()/continue() are asynchronous: a rejection here (e.g. the page
            // closed meanwhile) must not surface as an unhandled rejection.
            Promise.resolve(outcome).catch((error) => {
                log.debug(`Request interception failed: ${error.message}`);
            });
        });

        // Set extra headers
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        });
    } catch (error) {
        log.warning(`Failed to prepare page: ${error.message}`);
    }
}
/**
 * Navigate to a URL, retrying on failure.
 *
 * An attempt succeeds when `page.goto` yields a response whose status is below
 * 400. Both kinds of failure (an error status / missing response, or a thrown
 * navigation error) are followed by a linearly growing pause (2s, 4s, ...)
 * before the next attempt; no pause is taken after the final attempt.
 *
 * @param {object} page - Puppeteer page used for navigation.
 * @param {string} url - Address to load.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<boolean>} true once an attempt succeeds, false otherwise.
 */
export async function navigateToUrl(page, url, maxRetries = 3) {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            log.info(`Navigation attempt ${attempt}/${maxRetries} for ${url}`);

            const response = await page.goto(url, {
                waitUntil: 'networkidle2',
                timeout: 30000,
            });

            if (response && response.status() < 400) {
                return true;
            }

            const outcome = response ? `HTTP ${response.status()}` : 'No response';
            log.warning(`${outcome} for ${url}, attempt ${attempt}`);
        } catch (error) {
            log.warning(`Navigation attempt ${attempt} failed for ${url}: ${error.message}`);
        }

        // Wait before retry (skipped once there is nothing left to retry)
        if (attempt < maxRetries) {
            await new Promise((resolve) => setTimeout(resolve, 2000 * attempt));
        }
    }

    return false;
}
/**
 * Enhanced technology detection with multiple detection methods.
 *
 * Checks run in order and the function returns as soon as one identifies the
 * platform: URL substrings, HTTP response headers, in-page DOM/JS inspection,
 * observed network requests, then well-known API endpoints. An error raised by
 * any step is logged and the (undetected) result is returned.
 *
 * @param {object} page - Puppeteer page; `goto` and `evaluate` are called on it.
 * @param {string} url - URL of the site under analysis.
 * @param {object} [config] - Optional settings. `SETTLE_DELAY_MS` (default
 *     3000) is the pause between header inspection and DOM inspection.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean, detectionMethod: (string|null)}>}
 */
export async function detectTechnologies(page, url, config) {
    const result = {
        isShopify: false,
        isWordPress: false,
        detectionMethod: null,
    };
    const settleDelayMs = config?.SETTLE_DELAY_MS ?? 3000;

    try {
        // Method 1: Quick URL-based detection
        const urlLower = url.toLowerCase();
        if (urlLower.includes('myshopify.com') ||
            urlLower.includes('shopifycdn.com') ||
            urlLower.includes('shopifycs.com')) {
            result.isShopify = true;
            result.detectionMethod = 'URL';
            return result;
        }

        if (urlLower.includes('wordpress.com') ||
            urlLower.includes('wp-content') ||
            urlLower.includes('wp-includes')) {
            result.isWordPress = true;
            result.detectionMethod = 'URL';
            return result;
        }

        // Method 2: HTTP Headers detection
        const response = await page.goto(url, { waitUntil: 'networkidle2' });
        const headers = response?.headers() || {};

        // Check Shopify-specific headers. A generic `server: nginx/<version>`
        // header is deliberately not treated as a Shopify signal: it is sent
        // by unrelated sites as well.
        if (headers['x-shopify-stage'] ||
            headers['x-shopify-shop-api-call-limit'] ||
            headers['x-shopify-request-id']) {
            result.isShopify = true;
            result.detectionMethod = 'HTTP Headers';
            return result;
        }

        // Check WordPress headers
        if (headers['x-pingback'] ||
            (headers['link'] && headers['link'].includes('wp-json'))) {
            result.isWordPress = true;
            result.detectionMethod = 'HTTP Headers';
            return result;
        }

        // Method 3: Give late scripts time to run. A plain timer is used
        // instead of page.waitForTimeout / page.waitForLoadState so this step
        // does not depend on page helpers that may be missing.
        if (settleDelayMs > 0) {
            await new Promise((resolve) => setTimeout(resolve, settleDelayMs));
        }

        // Method 4: Enhanced DOM-based detection (the callback runs in the page)
        const detectionResult = await page.evaluate(() => {
            // 'theme-color' is intentionally absent: it is a standard meta tag
            // name and is not specific to Shopify.
            // NOTE(review): the bare 'Shopify' entry matches any page whose HTML
            // mentions the word — confirm this breadth is intended.
            const shopifyIndicators = [
                // JavaScript objects
                'Shopify',
                'Shopify.theme',
                'Shopify.shop',
                'Shopify.currency',
                'ShopifyAnalytics',
                'window.Shopify',

                // Meta tags
                'shopify-digital-wallet',
                'shopify-checkout-api-token',

                // CSS classes and IDs
                'shopify-section',
                'shopify-policy-list',
                'shopify-product-form',

                // Script sources
                'shopifycdn.com',
                'assets/theme.js',
                'assets/option_selection.js',
                '/wpm@',
                'monorail-edge.shopifysvc.com',
            ];

            const wordpressIndicators = [
                'wp-content',
                'wp-includes',
                'wp-admin',
                'wordpress',
                '/wp-json/',
                'wp-embed',
                'wlwmanifest',
                'xmlrpc.php',
            ];

            let shopifyDetected = false;
            let wordpressDetected = false;
            let method = 'DOM';

            // Check JavaScript objects
            if (typeof window.Shopify !== 'undefined' ||
                typeof window.ShopifyAnalytics !== 'undefined') {
                shopifyDetected = true;
                method = 'JavaScript objects';
            }

            // Check document content
            const htmlContent = document.documentElement.outerHTML.toLowerCase();

            // Shopify detection in HTML
            for (const indicator of shopifyIndicators) {
                if (htmlContent.includes(indicator.toLowerCase())) {
                    shopifyDetected = true;
                    if (method === 'DOM') method = `DOM (${indicator})`;
                    break;
                }
            }

            // WordPress detection in HTML (only when Shopify was not found)
            if (!shopifyDetected) {
                for (const indicator of wordpressIndicators) {
                    if (htmlContent.includes(indicator.toLowerCase())) {
                        wordpressDetected = true;
                        if (method === 'DOM') method = `DOM (${indicator})`;
                        break;
                    }
                }
            }

            // Check meta generators
            const generators = document.querySelectorAll('meta[name="generator"]');
            generators.forEach((gen) => {
                const content = gen.getAttribute('content')?.toLowerCase() || '';
                if (content.includes('shopify')) {
                    shopifyDetected = true;
                    method = 'Meta generator';
                } else if (content.includes('wordpress')) {
                    wordpressDetected = true;
                    method = 'Meta generator';
                }
            });

            // Check for specific CSS files
            const links = document.querySelectorAll('link[rel="stylesheet"]');
            links.forEach((link) => {
                const href = link.getAttribute('href')?.toLowerCase() || '';
                if (href.includes('shopifycdn.com') || href.includes('assets/theme.css')) {
                    shopifyDetected = true;
                    method = 'CSS links';
                } else if (href.includes('wp-content') || href.includes('wp-includes')) {
                    wordpressDetected = true;
                    method = 'CSS links';
                }
            });

            return {
                shopify: shopifyDetected,
                wordpress: wordpressDetected,
                method: method,
            };
        });

        if (detectionResult.shopify) {
            result.isShopify = true;
            result.detectionMethod = detectionResult.method;
            return result;
        }

        if (detectionResult.wordpress) {
            result.isWordPress = true;
            result.detectionMethod = detectionResult.method;
            return result;
        }

        // Method 5: Network requests analysis
        log.info(`Checking network requests for ${url}`);
        const networkDetection = await checkNetworkRequests(page, url);
        if (networkDetection.isShopify) {
            result.isShopify = true;
            result.detectionMethod = 'Network requests';
            return result;
        }

        if (networkDetection.isWordPress) {
            result.isWordPress = true;
            result.detectionMethod = 'Network requests';
            return result;
        }

        // Method 6: API endpoint testing
        const apiDetection = await testApiEndpoints(page, url);
        if (apiDetection.isShopify) {
            result.isShopify = true;
            result.detectionMethod = 'API endpoints';
            return result;
        }

        if (apiDetection.isWordPress) {
            result.isWordPress = true;
            result.detectionMethod = 'API endpoints';
            return result;
        }
    } catch (error) {
        log.error(`Detection error for ${url}: ${error.message}`);
    }

    return result;
}
/**
 * Check network requests for platform indicators.
 *
 * Listens to the page's 'request' events for up to `timeoutMs` and resolves as
 * soon as a request URL contains a Shopify or WordPress marker. The listener
 * and the timer are always released before resolving.
 *
 * The page is only scrolled to provoke lazy-loaded requests. No platform
 * endpoints are fetched here: a self-issued request to e.g. `/products.json`
 * would itself match the Shopify markers below and flag every site.
 *
 * @param {object} page - Puppeteer page (`on`, `off` and `evaluate` are used).
 * @param {string} url - URL being analysed (not used by the check itself).
 * @param {number} [timeoutMs=5000] - How long to observe before giving up.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function checkNetworkRequests(page, url, timeoutMs = 5000) {
    const shopifyMarkers = [
        'shopifycdn.com',
        'shopifycs.com',
        'monorail-edge.shopifysvc.com',
        '/cart/add',
        '/products.json',
    ];
    const wordpressMarkers = ['wp-content', 'wp-includes', 'wp-admin', 'wp-json'];

    return new Promise((resolve) => {
        let settled = false;
        let timer = null;

        // Single exit point: stop the timer, detach the listener, resolve once.
        const finish = (verdict) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            page.off('request', requestHandler);
            resolve(verdict);
        };

        const requestHandler = (request) => {
            const requestUrl = request.url().toLowerCase();

            if (shopifyMarkers.some((marker) => requestUrl.includes(marker))) {
                finish({ isShopify: true, isWordPress: false });
            } else if (wordpressMarkers.some((marker) => requestUrl.includes(marker))) {
                finish({ isShopify: false, isWordPress: true });
            }
        };

        timer = setTimeout(() => finish({ isShopify: false, isWordPress: false }), timeoutMs);
        page.on('request', requestHandler);

        // Scroll to trigger lazy loading. Best effort: a failure here just
        // means no extra requests get generated.
        page.evaluate(() => {
            window.scrollTo(0, document.body.scrollHeight / 2);
        }).catch(() => {});
    });
}
/**
 * Test common API endpoints.
 *
 * Navigates the page to a list of Shopify endpoints, then (if none matched) a
 * list of WordPress endpoints, and inspects each HTTP 200 response body for
 * platform-specific text. Because probing navigates the page away from the
 * site, the page is sent back to `baseUrl` before returning so that later
 * checks made on the same page see the site again rather than an API response.
 *
 * @param {object} page - Puppeteer page used for the probes.
 * @param {string} baseUrl - Site URL the endpoint paths are resolved against.
 * @returns {Promise<{isShopify: boolean, isWordPress: boolean}>}
 */
async function testApiEndpoints(page, baseUrl) {
    const result = { isShopify: false, isWordPress: false };

    // Shopify endpoints to test
    const shopifyEndpoints = [
        '/products.json',
        '/cart.js',
        '/collections.json',
        '/api/2023-01/products.json',
    ];

    // WordPress endpoints to test
    const wordpressEndpoints = [
        '/wp-json/',
        '/wp-json/wp/v2/',
        '/xmlrpc.php',
        '/?rest_route=/',
    ];

    let navigatedAway = false;

    // Loads one endpoint; yields the response only when it answered HTTP 200.
    const probe = async (endpoint) => {
        const testUrl = new URL(endpoint, baseUrl).href;
        navigatedAway = true;
        const response = await page.goto(testUrl, {
            waitUntil: 'networkidle2',
            timeout: 10000,
        });
        return response && response.status() === 200 ? response : null;
    };

    try {
        // Test Shopify endpoints
        for (const endpoint of shopifyEndpoints) {
            try {
                const response = await probe(endpoint);
                if (!response) {
                    continue;
                }

                const contentType = response.headers()['content-type'] || '';
                if (!contentType.includes('application/json')) {
                    continue;
                }

                const text = await response.text();
                if (text.includes('products') || text.includes('handle') || text.includes('variants')) {
                    result.isShopify = true;
                    return result;
                }
            } catch {
                // Endpoint not available, continue
            }
        }

        // Test WordPress endpoints (reached only when Shopify was not detected)
        for (const endpoint of wordpressEndpoints) {
            try {
                const response = await probe(endpoint);
                if (!response) {
                    continue;
                }

                const text = await response.text();
                if (text.includes('wp:') ||
                    text.includes('wordpress') ||
                    text.includes('wp-json') ||
                    text.includes('methodName')) {
                    result.isWordPress = true;
                    return result;
                }
            } catch {
                // Endpoint not available, continue
            }
        }
    } catch (error) {
        log.warning(`API endpoint testing failed: ${error.message}`);
    } finally {
        // Runs on every exit path, including the early returns above.
        if (navigatedAway) {
            try {
                await page.goto(baseUrl, { waitUntil: 'networkidle2', timeout: 30000 });
            } catch (error) {
                log.warning(`Could not return to ${baseUrl} after API endpoint testing: ${error.message}`);
            }
        }
    }

    return result;
}
/**
 * Detect email marketing tools.
 *
 * Each tool maps to a list of indicator strings; a tool counts as detected
 * when any of its indicators occurs (case-insensitively) in the page's HTML.
 * If the in-page evaluation fails, a warning is logged and every tool is
 * reported as not detected.
 *
 * @param {object} page - Puppeteer page whose current document is scanned.
 * @param {Object<string, string[]>} emailTools - Tool name -> indicator strings.
 *     A null/undefined value is treated as "no tools".
 * @returns {Promise<Object<string, boolean>>} Tool name -> detected flag.
 */
export async function detectEmailTools(page, emailTools) {
    const toolsConfig = emailTools ?? {};

    // Initialize all tools as false
    const results = Object.fromEntries(
        Object.keys(toolsConfig).map((tool) => [tool, false]),
    );

    try {
        // The callback runs inside the page, so it can only use its argument.
        const detection = await page.evaluate((tools) => {
            const htmlContent = document.documentElement.outerHTML.toLowerCase();
            const detected = {};

            for (const [toolName, indicators] of Object.entries(tools)) {
                detected[toolName] = indicators.some(
                    (indicator) => htmlContent.includes(indicator.toLowerCase()),
                );
            }

            return detected;
        }, toolsConfig);

        Object.assign(results, detection);
    } catch (error) {
        log.warning(`Email tools detection failed: ${error.message}`);
    }

    return results;
}

/**
 * Utility functions for the crawler
 */
/**
 * Get a random user agent from the list.
 *
 * @param {string[]} userAgents - Candidate user-agent strings.
 * @returns {string|undefined} One entry picked with Math.random(); undefined
 *     when the list is empty.
 */
export function getRandomUserAgent(userAgents) {
    const index = Math.floor(Math.random() * userAgents.length);
    return userAgents[index];
}
/**
 * Initialize the results structure for a keyword.
 *
 * @returns {Object<string, string[]>} A fresh object holding empty arrays for
 *     `shopify`, `wordpress`, `autres`, and one `shopifyWith<Tool>` /
 *     `wordpressWith<Tool>` bucket per email tool.
 */
export function initializeResults() {
    const emailTools = [
        'Klaviyo', 'Mailchimp', 'Brevo', 'Hubspot', 'Salesforce',
        'Convertkit', 'Aweber', 'Getresponse', 'Activecampaign',
        'Constantcontact', 'Omnisend', 'Privy', 'Yotpo', 'Attentive',
    ];

    const results = {
        shopify: [],
        wordpress: [],
        autres: [],
    };

    // Shopify buckets first, then WordPress ones; every bucket gets its own array.
    for (const platform of ['shopify', 'wordpress']) {
        for (const tool of emailTools) {
            results[`${platform}With${tool}`] = [];
        }
    }

    return results;
}
/**
 * Format results for output.
 *
 * Produces a timestamped object with a count summary, the three platform
 * lists, and one `shopify_avec_<tool>` / `wordpress_avec_<tool>` entry for each
 * non-empty email-tool bucket. Buckets that are empty or absent from `results`
 * are left out.
 *
 * @param {Object<string, string[]>} results - Per-keyword results; `shopify`,
 *     `wordpress` and `autres` must be arrays.
 * @returns {object} Formatted results.
 */
export function formatKeywordResults(results) {
    const { shopify, wordpress, autres } = results;

    const formatted = {
        timestamp: new Date().toISOString(),
        summary: {
            totalSites: shopify.length + wordpress.length + autres.length,
            shopifyCount: shopify.length,
            wordpressCount: wordpress.length,
            otherCount: autres.length,
        },
        shopify,
        wordpress,
        autres,
    };

    // Add email tool combinations
    const emailTools = [
        'Klaviyo', 'Mailchimp', 'Brevo', 'Hubspot', 'Salesforce',
        'Convertkit', 'Aweber', 'Getresponse', 'Activecampaign',
        'Constantcontact', 'Omnisend', 'Privy', 'Yotpo', 'Attentive',
    ];

    for (const tool of emailTools) {
        for (const platform of ['shopify', 'wordpress']) {
            const bucket = results[`${platform}With${tool}`];
            // `?.` tolerates a results object that lacks this bucket.
            if (bucket?.length > 0) {
                formatted[`${platform}_avec_${tool.toLowerCase()}`] = bucket;
            }
        }
    }

    return formatted;
}
/**
 * Sleep function for delays.
 *
 * @param {number} ms - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
export function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
/**
 * Normalize a URL: lower-case the host, strip a leading "www.", and remove one
 * trailing slash from a non-root path.
 *
 * The query string and fragment are not part of the result. An explicit port
 * is kept, since different ports can serve different sites.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} `<protocol>//<host><path>`, or the input unchanged when it
 *     cannot be parsed as a URL.
 */
export function normalizeUrl(url) {
    try {
        const { protocol, host, pathname } = new URL(url);

        // Remove www. (only as a prefix of the host)
        const lowerHost = host.toLowerCase();
        const bareHost = lowerHost.startsWith('www.') ? lowerHost.slice(4) : lowerHost;

        // Remove trailing slash from pathname, but keep the root "/"
        const path = pathname.length > 1 && pathname.endsWith('/')
            ? pathname.slice(0, -1)
            : pathname;

        return `${protocol}//${bareHost}${path}`;
    } catch {
        return url;
    }
}
/**
 * Remove duplicates from an array.
 *
 * @param {Array} array - Input values.
 * @returns {Array} New array with the first occurrence of each value, in the
 *     original order (Set semantics for equality).
 */
export function removeDuplicates(array) {
    const unique = new Set(array);
    return [...unique];
}
/**
 * Extract the domain from a URL.
 *
 * @param {string} url - URL to parse.
 * @returns {string|null} Lower-cased hostname without a leading "www.", or
 *     null when the input cannot be parsed as a URL.
 */
export function extractDomain(url) {
    try {
        const hostname = new URL(url).hostname.toLowerCase();
        // Strip "www." only as a prefix; removing it anywhere would corrupt
        // hosts such as "notwww.example.com".
        return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
    } catch {
        return null;
    }
}
/**
 * Check if a string is a valid URL.
 *
 * @param {string} string - Candidate URL.
 * @returns {boolean} true when the URL constructor accepts the value (any
 *     scheme), false when it throws.
 */
export function isValidUrl(string) {
    try {
        // Constructed only for its validation side effect.
        new URL(string);
    } catch {
        return false;
    }
    return true;
}
/**
 * Generate a random delay between min and max milliseconds.
 *
 * @param {number} [min=1000] - Lower bound (inclusive).
 * @param {number} [max=3000] - Upper bound (inclusive).
 * @returns {number} Random integer number of milliseconds; note this returns
 *     the duration and does not wait.
 */
export function randomDelay(min = 1000, max = 3000) {
    const span = max - min + 1;
    return min + Math.floor(Math.random() * span);
}
/**
 * Retry a function with exponential backoff.
 *
 * Calls `fn` until it succeeds or the attempt budget is used up, sleeping
 * `baseDelay * 2^(attempt - 1)` milliseconds between attempts. `fn` is always
 * called at least once, even when `maxRetries` is zero, negative or not a
 * number; a fractional `maxRetries` is rounded down.
 *
 * @param {Function} fn - Operation to run; may be async.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @param {number} [baseDelay=1000] - Delay before the second attempt, in ms.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error of the last failed attempt.
 */
export async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
    const attempts = Math.max(1, Math.floor(Number(maxRetries)) || 1);

    for (let attempt = 1; ; attempt++) {
        try {
            return await fn();
        } catch (error) {
            if (attempt >= attempts) {
                throw error;
            }

            await sleep(baseDelay * 2 ** (attempt - 1));
        }
    }
}
/**
 * Clean and validate a keyword.
 *
 * @param {*} keyword - Raw keyword.
 * @returns {string|null} The keyword trimmed and lower-cased, or null when the
 *     input is falsy or not a string. A whitespace-only string yields ''.
 */
export function cleanKeyword(keyword) {
    const usable = typeof keyword === 'string' && keyword.length > 0;
    return usable ? keyword.trim().toLowerCase() : null;
}
/**
 * Generate a safe filename from a keyword.
 *
 * Accented letters are first reduced to their base letter (so "vêtements"
 * gives "vetements" rather than "v_tements"); every remaining character
 * outside [a-zA-Z0-9] becomes "_", runs of "_" are collapsed, one leading and
 * one trailing "_" are dropped, and the result is lower-cased.
 *
 * @param {string} keyword - Keyword to convert.
 * @returns {string} Filename-safe slug (may be empty).
 */
export function generateSafeFilename(keyword) {
    return keyword
        // Decompose accented characters, then drop the combining marks.
        .normalize('NFD')
        .replace(/[\u0300-\u036f]/g, '')
        .replace(/[^a-zA-Z0-9]/g, '_')
        .replace(/_+/g, '_')
        .replace(/^_|_$/g, '')
        .toLowerCase();
}
/**
 * Calculate a detection confidence score.
 *
 * The score is the weight of the methods that fired divided by the weight of
 * all methods, as a percentage. A method without an entry in `weights` counts
 * with weight 1; an explicit weight of 0 is honoured and removes the method
 * from the score.
 *
 * @param {Object<string, boolean>} detectionMethods - Method name -> whether
 *     it detected the platform.
 * @param {Object<string, number>} [weights={}] - Method name -> weight.
 * @returns {number} Score from 0 to 100; 0 when the total weight is not positive.
 */
export function calculateConfidenceScore(detectionMethods, weights = {}) {
    let totalScore = 0;
    let maxPossibleScore = 0;

    for (const [method, detected] of Object.entries(detectionMethods)) {
        // `??` rather than `||` so that a configured weight of 0 is kept.
        const weight = weights[method] ?? 1;
        maxPossibleScore += weight;

        if (detected) {
            totalScore += weight;
        }
    }

    return maxPossibleScore > 0 ? (totalScore / maxPossibleScore) * 100 : 0;
}
/**
 * Log statistics.
 *
 * Builds a summary of the per-platform counts for one keyword, prints it with
 * console.log and returns it.
 *
 * @param {{shopify: Array, wordpress: Array, autres: Array}} results - Results
 *     collected for the keyword.
 * @param {string} keyword - Keyword the results belong to.
 * @returns {object} The summary: counts, rounded Shopify/WordPress percentages
 *     (0 when nothing was analyzed), keyword and ISO timestamp.
 */
export function logStatistics(results, keyword) {
    const shopify = results.shopify.length;
    const wordpress = results.wordpress.length;
    const other = results.autres.length;
    const totalAnalyzed = shopify + wordpress + other;

    // Share of the total as a whole percentage; 0 avoids dividing by zero.
    const percentageOf = (count) => (totalAnalyzed > 0 ? Math.round((count / totalAnalyzed) * 100) : 0);

    const stats = {
        keyword,
        timestamp: new Date().toISOString(),
        totalAnalyzed,
        shopify,
        wordpress,
        other,
        shopifyPercentage: percentageOf(shopify),
        wordpressPercentage: percentageOf(wordpress),
    };

    console.log(`📊 Statistics for "${keyword}":`, stats);
    return stats;
}

// The text that follows in this dump is not JavaScript (ignore-file entries):
// # configurations
// .idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
{
    "extends": "@apify",
    "root": true
}
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
{
    "name": "shopify-wordpress-crawler",
    "version": "2.0.0",
    "description": "Enhanced Apify Actor for analyzing Shopify and WordPress usage with improved detection",
    "main": "main.js",
    "type": "module",
    "scripts": {
        "start": "node main.js",
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "keywords": [
        "apify",
        "shopify",
        "wordpress",
        "crawler",
        "web-scraping",
        "ecommerce",
        "cms-detection"
    ],
    "author": "Your Name",
    "license": "MIT",
    "dependencies": {
        "apify": "^3.1.0",
        "crawlee": "^3.5.0",
        "puppeteer": "^21.0.0"
    },
    "repository": {
        "type": "git",
        "url": "https://github.com/your-username/shopify-wordpress-crawler"
    }
}