Email

Try for free

Developed by

TML

0.0 (0)

Pricing

Pay per usage

Last modified

a month ago

E-commerce

Open source

.actor/Dockerfile

# Build stage: compiles the TypeScript sources into ./dist.
# The base image is documented at https://crawlee.dev/docs/guides/docker-images
# (any other image from Docker Hub can be used instead).
FROM apify/actor-node-playwright-chrome:20 AS builder

# Print which of these packages the base image already provides
# (diagnostic output for the build log).
RUN npm ls crawlee apify puppeteer playwright

# Copy only the package manifests first so that the dependency layer below
# stays cached until package.json / package-lock.json change.
COPY --chown=myuser package*.json ./

# Install all dependencies, including devDependencies: the build script runs
# `tsc`, which package.json lists under devDependencies.
# Auditing is disabled to speed up the installation.
RUN npm install --include=dev --audit=false

# Copy the rest of the project, owned by the user set in the base image.
COPY --chown=myuser . ./

# Compile the project (package.json: "build" -> `tsc`, output goes to ./dist).
# Nothing is installed in this step.
RUN npm run build

# Runtime stage: production dependencies plus the compiled output of the
# builder stage.
FROM apify/actor-node-playwright-chrome:20

# Print which of these packages the base image already provides
# (diagnostic output for the build log).
RUN npm ls crawlee apify puppeteer playwright

# Copy only the package manifests first so that the dependency layer below
# stays cached until package.json / package-lock.json change.
COPY --chown=myuser package*.json ./

# Install production dependencies only (no dev, no optional packages) to keep
# the image small. Progress output is disabled, then the dependency tree and
# the Node.js / npm versions are printed for debugging; `npm list` is allowed
# to fail. The npm cache is removed in the same layer so it never reaches the image.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy the compiled JS files from the builder stage.
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Copy the remaining project files. Because this happens after the npm
# install, most source changes only invalidate the layers from here on.
# NOTE(review): if the build context contains a local dist/ directory (i.e. it
# is not excluded via .dockerignore), this copy overwrites files of the same
# name that were just taken from the builder stage.
COPY --chown=myuser . ./


# Start the actor: run the XVFB start script, then the compiled entry point
# ("start:prod" -> `node dist/main.js`). If headful browsers are never needed,
# the XVFB script can be dropped for a small performance gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-2",
    "title": "Project Playwright Crawler Typescript",
    "description": "Crawlee and Playwright project in typescript.",
    "version": "0.0",
    "meta": {
        "templateId": "ts-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Email Extractor Crawler Avancé",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "URLs de départ",
            "type": "array",
            "description": "URLs à partir desquelles commencer le crawling. Le robot suivra automatiquement les liens importants (contact, mentions légales, etc.)",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://www.example.com"
                }
            ]
        },
        "maxRequestsPerCrawl": {
            "title": "Nombre maximum de requêtes",
            "type": "integer",
            "description": "Nombre maximum de pages à visiter au total",
            "default": 30,
            "minimum": 1,
            "maximum": 100
        },
        "maxDepth": {
            "title": "Profondeur maximale",
            "type": "integer", 
            "description": "Profondeur maximale de navigation (0 = page d'accueil uniquement, 1 = + liens directs, 2 = + liens de niveau 2)",
            "default": 2,
            "minimum": 0,
            "maximum": 3
        },
        "maxPagesPerDomain": {
            "title": "Pages max par domaine",
            "type": "integer",
            "description": "Nombre maximum de pages à crawler par domaine",
            "default": 20,
            "minimum": 1,
            "maximum": 50
        },
        "followImportantLinks": {
            "title": "Suivre les liens importants",
            "type": "boolean",
            "description": "Suivre automatiquement les liens vers les pages contact, mentions légales, équipe, etc.",
            "default": true
        },
        "excludePatterns": {
            "title": "Patterns à exclure",
            "type": "array",
            "description": "Extensions ou mots à exclure des emails trouvés",
            "editor": "stringList",
            "default": ["jpg", "png", "pdf", "gif", "jpeg", "svg", "ico", "webp"],
            "prefill": ["jpg", "png", "pdf", "gif", "jpeg", "svg", "ico", "webp"]
        },
        "targetSelectors": {
            "title": "Sélecteurs CSS ciblés (optionnel)",
            "type": "array",
            "description": "Sélecteurs CSS supplémentaires pour cibler des zones spécifiques (ex: '.company-info', '#team-section')",
            "editor": "stringList",
            "default": []
        },
        "includeGenericEmails": {
            "title": "Inclure les emails génériques",
            "type": "boolean",
            "description": "Inclure les emails comme info@, contact@, support@ (généralement exclus par défaut)",
            "default": false
        },
        "waitForSelector": {
            "title": "Attendre un sélecteur (optionnel)",
            "type": "string",
            "description": "Sélecteur CSS à attendre avant d'extraire les emails (utile pour les sites avec du JavaScript)",
            "editor": "textfield",
            "prefill": ""
        },
        "customUserAgent": {
            "title": "User-Agent personnalisé (optionnel)",
            "type": "string", 
            "description": "User-Agent à utiliser pour les requêtes HTTP",
            "editor": "textfield",
            "prefill": ""
        },
        "respectRobotsTxt": {
            "title": "Respecter robots.txt",
            "type": "boolean",
            "description": "Respecter les directives du fichier robots.txt du site",
            "default": true
        },
        "delayBetweenRequests": {
            "title": "Délai entre requêtes (ms)",
            "type": "integer",
            "description": "Délai en millisecondes entre chaque requête pour éviter la surcharge du serveur",
            "default": 1000,
            "minimum": 0,
            "maximum": 10000
        },
        "exportFormat": {
            "title": "Format d'export",
            "type": "string",
            "description": "Format de sortie des résultats",
            "editor": "select",
            "enum": ["json", "csv", "txt"],
            "enumTitles": ["JSON (détaillé)", "CSV (simple)", "TXT (liste simple)"],
            "default": "json"
        },
        "groupByDomain": {
            "title": "Grouper par domaine",
            "type": "boolean",
            "description": "Organiser les résultats par nom de domaine",
            "default": true
        },
        "verbose": {
            "title": "Mode verbeux",
            "type": "boolean",
            "description": "Afficher plus d'informations de debug dans les logs",
            "default": false
        }
    },
    "required": ["startUrls"],
    "additionalProperties": false
}

src/main.ts

1/**
2 * Crawler avancé pour extraire des adresses emails à partir d'URLs
3 * Utilise Crawlee, Playwright et Chrome headless pour une extraction intelligente.
4 * Cible spécifiquement les footers, pages légales et sections importantes.
5 */
6
7import { Actor } from 'apify';
8import { PlaywrightCrawler, createPlaywrightRouter } from 'crawlee';
9
/** Global pattern that matches every e-mail-like substring of a text. */
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

/**
 * CSS selectors of the page areas that are scanned first for e-mail
 * addresses: footers, contact blocks, legal/about/team sections
 * (English and French class names).
 */
const IMPORTANT_SELECTORS = [
    // Footer
    'footer',
    '.footer',
    '#footer',
    '[class*="footer"]',
    '[id*="footer"]',
    // Contact
    '.contact',
    '#contact',
    '[class*="contact"]',
    '[id*="contact"]',
    // Legal / about / team, exact class names
    '.legal',
    '.mentions',
    '.copyright',
    '.about',
    '.team',
    '.staff',
    // Legal / about / team, partial class names
    '[class*="legal"]',
    '[class*="mention"]',
    '[class*="copyright"]',
    '[class*="about"]',
    '[class*="team"]',
    '[class*="staff"]',
    // French class names
    '.qui-sommes-nous',
    '.equipe',
    '.équipe',
    '.coordonnees',
    '.coordonnées',
];

/**
 * Case-insensitive patterns that mark a link as worth following
 * (contact, legal notice, team, privacy policy, terms, ...).
 */
const IMPORTANT_LINK_PATTERNS = [
    /contact/i,
    /mention/i,
    /legal/i,
    /about/i,
    /team/i,
    /staff/i,
    /qui.*sommes/i,
    /équipe/i,
    /equipe/i,
    /cgv/i,
    /cgu/i,
    /politique/i,
    /confidentialité/i,
    /confidentialite/i,
    /privacy/i,
    /terms/i,
    /conditions/i,
    /coordonnees/i,
    /coordonnées/i,
];
30
/**
 * Shape of the actor input as read with `Actor.getInput()`.
 *
 * Only `startUrls` is mandatory. Every other field is optional because the
 * input schema lists `startUrls` as its sole required property and the code
 * falls back to defaults for the rest.
 */
interface CrawlerInput {
    /** Start URLs: plain strings or request-like objects carrying a `url`. */
    startUrls: any[];
    /** Upper bound on the number of requests handled in one run. */
    maxRequestsPerCrawl?: number;
    /** Maximum link depth to follow; 0 means start pages only. */
    maxDepth?: number;
    /** Page budget; used as the fallback for `maxRequestsPerCrawl`. */
    maxPagesPerDomain?: number;
    /** Whether links to contact/legal/team pages are followed. */
    followImportantLinks?: boolean;
    /** Extensions to exclude from the extracted e-mails. */
    excludePatterns?: string[];
}
40
/**
 * A link judged worth following, as collected in the browser context.
 */
interface ImportantLink {
    /** Absolute URL of the link target. */
    url: string;

    /** Trimmed visible text of the anchor element. */
    text: string;
}
46
/**
 * Normalizes raw e-mail candidates and drops the unusable ones.
 *
 * Every candidate is trimmed and lower-cased, then rejected when it
 *  - is empty,
 *  - does not match the e-mail format,
 *  - ends with one of the excluded file extensions (e.g. `logo@2x.png`,
 *    which the e-mail regex picks up from image file names), or
 *  - contains a well-known placeholder marker (`example.com`, `noreply@`, ...).
 *
 * Extensions are compared against the END of the address only. A plain
 * substring test would also discard real addresses such as
 * `nico@company.fr` ("ico") and an empty pattern would discard everything.
 *
 * @param emails Raw candidates, possibly padded or mixed-case.
 * @param excludePatterns File extensions to reject, with or without a leading
 *     dot, case-insensitive. Blank entries are ignored.
 * @returns The accepted addresses in first-seen order, without duplicates.
 */
function validateAndCleanEmails(emails: string[], excludePatterns: string[]): string[] {
    const emailFormat = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;

    const placeholderMarkers = [
        'example.com', 'test.com', 'domain.com', 'yoursite.com',
        'website.com', 'noreply@', 'no-reply@', 'placeholder',
        'dummy', 'fake', 'sample',
    ];

    // Build the ".ext" suffixes once instead of once per e-mail.
    const excludedSuffixes = excludePatterns
        .map((pattern) => pattern.trim().toLowerCase().replace(/^\.+/, ''))
        .filter((extension) => extension.length > 0)
        .map((extension) => `.${extension}`);

    const accepted = emails
        .map((email) => email.trim().toLowerCase())
        .filter((email) => {
            if (!email) return false;
            if (!emailFormat.test(email)) return false;
            if (excludedSuffixes.some((suffix) => email.endsWith(suffix))) return false;
            return !placeholderMarkers.some((marker) => email.includes(marker));
        });

    // Lower-casing can turn distinct inputs into the same address.
    return [...new Set(accepted)];
}
72
/**
 * Returns the host name of `url`.
 *
 * @param url Absolute URL to inspect.
 * @returns The parsed host name, or `'unknown'` when `url` cannot be parsed.
 */
function getDomainFromUrl(url: string): string {
    let hostname = 'unknown';

    try {
        ({ hostname } = new URL(url));
    } catch {
        // Unparseable input: keep the fallback value.
    }

    return hostname;
}
81
// Tail of the promise chain that serializes access to the shared crawl state.
let sharedStateQueue: Promise<unknown> = Promise.resolve();

/**
 * Runs `task` only after every previously queued task has settled.
 *
 * The crawl state ('foundEmails', 'visitedUrls', 'domainEmails') lives in the
 * key-value store and is updated with read-modify-write sequences. The crawler
 * handles several pages concurrently, so two unserialized sequences can
 * interleave and the later write silently drops the earlier one's additions.
 *
 * @param task Async function performing one complete read-modify-write.
 * @returns Whatever `task` resolves to; a rejection is passed to the caller.
 */
function withSharedState<T>(task: () => Promise<T>): Promise<T> {
    const outcome = sharedStateQueue.then(task);
    // A failed task must not block the tasks queued after it.
    sharedStateQueue = outcome.catch(() => undefined);
    return outcome;
}

// Router holding the extraction logic.
const router = createPlaywrightRouter();

/**
 * Default handler, executed for every crawled page.
 *
 * Steps:
 *  1. wait (best effort) for the network to go idle,
 *  2. record the URL as visited,
 *  3. collect e-mail candidates from the important page areas, from `mailto:`
 *     links and from the full page text,
 *  4. validate them, store the ones not seen before and push one dataset item
 *     for the page,
 *  5. when allowed by `userData.followImportantLinks` and `userData.maxDepth`,
 *     enqueue up to 5 distinct links that look like contact/legal/team pages.
 *
 * Reads `depth` (default 0), `maxDepth` (default 1) and `followImportantLinks`
 * (default true) from `request.userData` and hands them on to enqueued
 * requests. Errors are logged and swallowed, so a failing page never aborts
 * the crawl.
 */
router.addDefaultHandler(async ({ page, request, log, enqueueLinks }) => {
    const url = request.url;
    // `??` instead of `||`: a configured maxDepth of 0 must stay 0.
    const depth = request.userData?.depth ?? 0;
    const maxDepth = request.userData?.maxDepth ?? 1;
    const followImportantLinks = request.userData?.followImportantLinks !== false;
    const domain = getDomainFromUrl(url);

    log.info(`🔍 Traitement de: ${url} (profondeur: ${depth}, domaine: ${domain})`);

    try {
        // Pages with permanent background traffic never reach 'networkidle'.
        // On timeout, extract from what is already loaded instead of skipping the page.
        await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
            log.warning(`⏱️ Délai 'networkidle' dépassé pour ${url}, extraction sur le contenu déjà chargé`);
        });

        // Record the visit.
        await withSharedState(async () => {
            const visitedUrls = (await Actor.getValue('visitedUrls') as string[]) || [];
            if (!visitedUrls.includes(url)) {
                visitedUrls.push(url);
                await Actor.setValue('visitedUrls', visitedUrls);
            }
        });

        // E-mails found inside the important page areas (text and markup).
        const emailsFromZones = await page.evaluate((selectors: string[]) => {
            const emails: string[] = [];
            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

            selectors.forEach((selector) => {
                try {
                    document.querySelectorAll(selector).forEach((element) => {
                        const text = element.textContent || (element as HTMLElement).innerText || '';
                        const html = element.innerHTML || '';

                        // The markup also covers attribute values such as href.
                        emails.push(...(text.match(emailRegex) || []), ...(html.match(emailRegex) || []));
                    });
                } catch (e) {
                    // A selector the browser rejects is skipped.
                }
            });

            return emails;
        }, IMPORTANT_SELECTORS);

        // E-mails from mailto: links, without the scheme and the query part.
        const mailtoEmails = await page.evaluate(() => {
            const mailtoLinks = document.querySelectorAll('a[href^="mailto:"]');
            return Array.from(mailtoLinks).map((link) => {
                const href = link.getAttribute('href');
                return href ? href.replace('mailto:', '').split('?')[0] : '';
            }).filter((email) => email);
        });

        // Fallback: scan the whole visible text. Documents without a <body> yield ''.
        const allPageText = await page.evaluate(() => {
            return document.body?.innerText || document.body?.textContent || '';
        });
        const allPageEmails = allPageText.match(EMAIL_REGEX) || [];

        // Merge all sources, validate, and de-duplicate again because
        // validation lower-cases the addresses.
        const candidates = [...new Set([...emailsFromZones, ...mailtoEmails, ...allPageEmails])];
        const validEmails = [...new Set(validateAndCleanEmails(candidates, [
            'jpg', 'png', 'pdf', 'gif', 'jpeg', 'svg', 'ico', 'webp',
        ]))];

        // Work out which addresses are new and persist them in one serialized step.
        const newEmails = await withSharedState(async () => {
            const foundEmails = (await Actor.getValue('foundEmails') as string[]) || [];
            const known = new Set(foundEmails);
            const fresh = validEmails.filter((email) => !known.has(email));
            if (fresh.length === 0) return fresh;

            foundEmails.push(...fresh);
            await Actor.setValue('foundEmails', foundEmails);

            const domainEmails = (await Actor.getValue('domainEmails') as Record<string, string[]>) || {};
            if (!domainEmails[domain]) {
                domainEmails[domain] = [];
            }
            domainEmails[domain].push(...fresh);
            await Actor.setValue('domainEmails', domainEmails);

            return fresh;
        });

        if (newEmails.length > 0) {
            log.info(`✅ Trouvé ${newEmails.length} nouveaux emails sur ${url}: ${newEmails.join(', ')}`);

            // One dataset item per page that yielded new addresses.
            await Actor.pushData({
                url,
                domain,
                emails: newEmails,
                emailCount: newEmails.length,
                depth,
                timestamp: new Date().toISOString(),
                source: 'targeted_extraction'
            });
        } else {
            log.info(`❌ Aucun nouvel email trouvé sur ${url}`);
        }

        // Follow important links when enabled and the depth budget allows it.
        if (followImportantLinks && depth < maxDepth) {
            const importantLinks = await page.evaluate((patterns: RegExp[]) => {
                const seen = new Set<string>();
                const matches: ImportantLink[] = [];

                document.querySelectorAll('a[href]').forEach((link) => {
                    const href = link.getAttribute('href');
                    if (!href || !(href.startsWith('http') || href.startsWith('/'))) return;

                    const text = link.textContent || (link as HTMLElement).innerText || '';
                    const title = link.getAttribute('title') || '';
                    const haystack = `${text} ${title} ${href}`.toLowerCase();
                    if (!patterns.some((pattern: RegExp) => pattern.test(haystack))) return;

                    const fullUrl = href.startsWith('/')
                        ? new URL(href, window.location.origin).href
                        : href;

                    // The same target often appears in header and footer; keep
                    // it once so it does not use up the per-page link budget.
                    if (seen.has(fullUrl)) return;
                    seen.add(fullUrl);

                    matches.push({ url: fullUrl, text: text.trim() });
                });

                return matches;
            }, IMPORTANT_LINK_PATTERNS);

            const visitedUrls = (await Actor.getValue('visitedUrls') as string[]) || [];
            const alreadyVisited = new Set(visitedUrls);

            // At most 5 links per page.
            for (const link of importantLinks.slice(0, 5)) {
                if (alreadyVisited.has(link.url)) continue;

                log.info(`🔗 Ajout du lien important: ${link.url} (${link.text})`);

                await enqueueLinks({
                    urls: [link.url],
                    userData: {
                        depth: depth + 1,
                        maxDepth,
                        followImportantLinks,
                        source: url,
                        linkText: link.text
                    }
                });
            }
        }

    } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        log.error(`❌ Erreur lors du traitement de ${url}: ${errorMessage}`);
    }
});
245
/**
 * Reads the crawl state from the key-value store and assembles the summary
 * record that closes a run.
 *
 * @param success Whether the crawl finished without throwing.
 * @param errorMessage Message of the error that ended the crawl; when given,
 *     it is included as `error`.
 * @returns The summary object (e-mails sorted alphabetically).
 */
async function buildFinalResult(success: boolean, errorMessage?: string) {
    const foundEmails = (await Actor.getValue('foundEmails') as string[]) || [];
    const visitedUrls = (await Actor.getValue('visitedUrls') as string[]) || [];
    const domainEmails = (await Actor.getValue('domainEmails') as Record<string, string[]>) || {};

    return {
        success,
        totalEmailsFound: foundEmails.length,
        emails: foundEmails.sort(),
        emailsByDomain: domainEmails,
        urlsVisited: visitedUrls.length,
        visitedUrls,
        ...(errorMessage === undefined ? {} : { error: errorMessage }),
        timestamp: new Date().toISOString()
    };
}

/**
 * Actor entry point.
 *
 * Validates the input, resets the shared state in the key-value store, runs
 * the Playwright crawler on the start URLs and finally stores a summary under
 * the 'OUTPUT' key and in the dataset. When the crawl throws, the partial
 * results are stored the same way with `success: false`.
 *
 * @throws Error when `startUrls` is missing, not an array, or empty.
 */
await Actor.main(async () => {
    const input = await Actor.getInput<CrawlerInput>();

    if (!input || !Array.isArray(input.startUrls) || input.startUrls.length === 0) {
        throw new Error('La configuration "startUrls" est requise et doit être un tableau non-vide d\'URLs');
    }

    // Reset the shared state.
    await Actor.setValue('foundEmails', [] as string[]);
    await Actor.setValue('visitedUrls', [] as string[]);
    await Actor.setValue('domainEmails', {} as Record<string, string[]>);

    console.log('🚀 Démarrage du crawler avec les URLs suivantes:', input.startUrls);

    // Configuration. `??` keeps an explicit maxDepth of 0 (start pages only),
    // which `||` would replace with the default.
    const maxDepth = input.maxDepth ?? 2;
    // Only used below as the fallback for maxRequestsPerCrawl.
    const maxPagesPerDomain = input.maxPagesPerDomain || 30;
    const followImportantLinks = input.followImportantLinks !== false;

    const crawler = new PlaywrightCrawler({
        requestHandler: router,

        launchContext: {
            launchOptions: {
                headless: true,
                // NOTE(review): '--disable-web-security' switches off the
                // same-origin policy in the browser; confirm it is needed.
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-web-security'
                ]
            },
        },

        maxRequestsPerCrawl: input.maxRequestsPerCrawl || maxPagesPerDomain,
        navigationTimeoutSecs: 20,
        maxConcurrency: 2,

        // Requests that fail for good are only logged.
        failedRequestHandler: async ({ request, error }) => {
            const errorMessage = error instanceof Error ? error.message : String(error);
            console.error(`❌ Échec de traitement pour ${request.url}:`, errorMessage);
        },
    });

    // Initial requests: accept plain strings as well as request-like objects
    // and keep any userData they already carry.
    const initialRequests = input.startUrls.map((urlData) => {
        const baseRequest = typeof urlData === 'string' ? { url: urlData } : urlData;
        return {
            ...baseRequest,
            userData: {
                ...baseRequest.userData,
                depth: 0,
                maxDepth,
                followImportantLinks
            }
        };
    });

    try {
        await crawler.run(initialRequests);

        const finalResult = await buildFinalResult(true);

        await Actor.setValue('OUTPUT', finalResult);
        await Actor.pushData(finalResult);

        console.log(`🎉 Crawling terminé avec succès !`);
        console.log(`📧 ${finalResult.totalEmailsFound} emails uniques trouvés`);
        console.log(`📄 ${finalResult.urlsVisited} pages visitées`);
        console.log(`🌐 ${Object.keys(finalResult.emailsByDomain).length} domaines analysés`);

    } catch (error) {
        console.error('❌ Erreur lors du crawling:', error);

        // Store whatever was collected before the failure.
        const errorMessage = error instanceof Error ? error.message : String(error);
        const finalResult = await buildFinalResult(false, errorMessage);

        await Actor.setValue('OUTPUT', finalResult);
        await Actor.pushData(finalResult);

        console.log('⚠️ Traitement terminé avec erreur, résultats partiels sauvegardés.');
    }
});

.dockerignore

# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

# build output: compiled inside the image by `npm run build`; a local copy
# must not reach the build context, where it could replace the fresh build
dist

.editorconfig

# Top-most EditorConfig file: editors stop looking in parent directories.
root = true

# Rules for every file in the project.
[*]
# Indent with 4 spaces.
indent_style = space
indent_size = 4
charset = utf-8
# Strip trailing whitespace and end each file with a single newline.
trim_trailing_whitespace = true
insert_final_newline = true
# Unix line endings.
end_of_line = lf

.eslintrc

{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ]
}

.gitignore

# Files and directories that Git must not add to source control.

# OS and editor files
.DS_Store
.idea
.vscode
# Build output (tsconfig.json: "outDir": "dist")
dist
# Installed dependencies
node_modules
# Local storage folders
apify_storage
storage

package.json

{
    "name": "crawlee-playwright-typescript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/eslint-config-ts": "^0.3.0",
        "@apify/tsconfig": "^0.1.0",
        "@typescript-eslint/eslint-plugin": "^7.18.0",
        "@typescript-eslint/parser": "^7.18.0",
        "eslint": "^8.50.0",
        "tsx": "^4.6.2",
        "typescript": "^5.3.3"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "tsc",
        "lint": "eslint ./src --ext .ts",
        "lint:fix": "eslint ./src --ext .ts --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": [
        "./src/**/*"
    ]
}