Pricing: Pay per usage
Total users: 1
Monthly users: 3
Runs succeeded: >99%
Last modified: 10 days ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm
# Copy built JS files from builder image
COPY --from=builder /home/myuser/dist ./dist
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-2", "title": "Project Playwright Crawler Typescript", "description": "Crawlee and Playwright project in typescript.", "version": "0.0", "meta": { "templateId": "ts-crawlee-playwright-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Email Extractor Crawler Avancé", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "URLs de départ", "type": "array", "description": "URLs à partir desquelles commencer le crawling. Le robot suivra automatiquement les liens importants (contact, mentions légales, etc.)", "editor": "requestListSources", "prefill": [ { "url": "https://www.example.com" } ] }, "maxRequestsPerCrawl": { "title": "Nombre maximum de requêtes", "type": "integer", "description": "Nombre maximum de pages à visiter au total", "default": 30, "minimum": 1, "maximum": 100 }, "maxDepth": { "title": "Profondeur maximale", "type": "integer", "description": "Profondeur maximale de navigation (0 = page d'accueil uniquement, 1 = + liens directs, 2 = + liens de niveau 2)", "default": 2, "minimum": 0, "maximum": 3 }, "maxPagesPerDomain": { "title": "Pages max par domaine", "type": "integer", "description": "Nombre maximum de pages à crawler par domaine", "default": 20, "minimum": 1, "maximum": 50 }, "followImportantLinks": { "title": "Suivre les liens importants", "type": "boolean", "description": "Suivre automatiquement les liens vers les pages contact, mentions légales, équipe, etc.", "default": true }, "excludePatterns": { "title": "Patterns à exclure", "type": "array", "description": "Extensions ou mots à exclure des emails trouvés", "editor": "stringList", "default": ["jpg", "png", "pdf", "gif", "jpeg", "svg", "ico", "webp"], "prefill": ["jpg", "png", "pdf", "gif", "jpeg", "svg", "ico", "webp"] }, "targetSelectors": { "title": "Sélecteurs CSS ciblés (optionnel)", "type": "array", "description": "Sélecteurs CSS supplémentaires pour cibler des zones spécifiques (ex: '.company-info', '#team-section')", "editor": "stringList", "default": [] }, "includeGenericEmails": { "title": "Inclure les emails génériques", "type": "boolean", "description": "Inclure les emails comme info@, contact@, support@ (généralement exclus par défaut)", "default": false }, "waitForSelector": { "title": "Attendre un sélecteur (optionnel)", "type": "string", "description": "Sélecteur CSS à attendre avant d'extraire les emails (utile pour les sites avec du JavaScript)", "editor": "textfield", "prefill": "" }, "customUserAgent": { "title": "User-Agent personnalisé (optionnel)", "type": "string", "description": "User-Agent à utiliser pour les requêtes HTTP", "editor": "textfield", "prefill": "" }, "respectRobotsTxt": { "title": "Respecter robots.txt", "type": "boolean", "description": "Respecter les directives du fichier robots.txt du site", "default": true }, "delayBetweenRequests": { "title": "Délai entre requêtes (ms)", "type": "integer", "description": "Délai en millisecondes entre chaque requête pour éviter la surcharge du serveur", "default": 1000, "minimum": 0, "maximum": 10000 }, "exportFormat": { "title": "Format d'export", "type": "string", "description": "Format de sortie des résultats", "editor": "select", "enum": ["json", "csv", "txt"], "enumTitles": ["JSON (détaillé)", "CSV (simple)", "TXT (liste simple)"], "default": "json" }, "groupByDomain": { "title": "Grouper par domaine", "type": "boolean", "description": "Organiser les résultats par nom de domaine", "default": true }, "verbose": { "title": "Mode verbeux", "type": "boolean", "description": "Afficher plus d'informations de debug dans les logs", "default": false } }, "required": ["startUrls"], "additionalProperties": false}
src/main.ts
/**
 * Advanced crawler for extracting email addresses from URLs.
 * Uses Crawlee, Playwright and headless Chrome for intelligent extraction.
 * Specifically targets footers, legal pages and other important sections.
 */

import { Actor } from 'apify';
import { PlaywrightCrawler, createPlaywrightRouter } from 'crawlee';

// Regular expression for detecting emails
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

// Selectors for important page areas
const IMPORTANT_SELECTORS = [
    'footer', '.footer', '#footer', '[class*="footer"]', '[id*="footer"]',
    '.contact', '#contact', '[class*="contact"]', '[id*="contact"]',
    '.legal', '.mentions', '.copyright', '.about', '.team', '.staff',
    '[class*="legal"]', '[class*="mention"]', '[class*="copyright"]',
    '[class*="about"]', '[class*="team"]', '[class*="staff"]',
    '.qui-sommes-nous', '.equipe', '.équipe', '.coordonnees', '.coordonnées'
];

// Patterns used to identify important links
const IMPORTANT_LINK_PATTERNS = [
    /contact/i, /mention/i, /legal/i, /about/i, /team/i, /staff/i,
    /qui.*sommes/i, /équipe/i, /equipe/i, /cgv/i, /cgu/i,
    /politique/i, /confidentialité/i, /confidentialite/i, /privacy/i,
    /terms/i, /conditions/i, /coordonnees/i, /coordonnées/i
];

// Input type definition
interface CrawlerInput {
    startUrls: any[];
    maxRequestsPerCrawl: number;
    maxDepth?: number;
    maxPagesPerDomain?: number;
    followImportantLinks?: boolean;
    excludePatterns?: string[];
}

// Interface for important links
interface ImportantLink {
    url: string;
    text: string;
}

// Clean up and validate emails
function validateAndCleanEmails(emails: string[], excludePatterns: string[]): string[] {
    return emails
        .map(email => email.trim().toLowerCase())
        .filter(email => {
            if (!email) return false;

            // Exclude emails containing image extensions
            const hasImageExtension = excludePatterns.some(pattern =>
                email.includes(pattern.toLowerCase())
            );

            // Check the format
            const isValidFormat = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/.test(email);

            // Exclude generic/example emails
            const isGeneric = [
                'example.com', 'test.com', 'domain.com', 'yoursite.com',
                'website.com', 'noreply@', 'no-reply@', 'placeholder',
                'dummy', 'fake', 'sample'
            ].some(generic => email.includes(generic));

            return !hasImageExtension && isValidFormat && !isGeneric;
        });
}

// Extract the domain from a URL
function getDomainFromUrl(url: string): string {
    try {
        return new URL(url).hostname;
    } catch {
        return 'unknown';
    }
}

// Create the router holding the extraction logic
const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ page, request, log, enqueueLinks }) => {
    const url = request.url;
    const depth = request.userData?.depth || 0;
    const maxDepth = request.userData?.maxDepth || 1;
    const domain = getDomainFromUrl(url);

    log.info(`🔍 Processing: ${url} (depth: ${depth}, domain: ${domain})`);

    try {
        // Wait for the page to load
        await page.waitForLoadState('networkidle', { timeout: 10000 });

        // Fetch the global state
        const foundEmails = await Actor.getValue('foundEmails') as string[] || [];
        const visitedUrls = await Actor.getValue('visitedUrls') as string[] || [];
        const domainEmails = await Actor.getValue('domainEmails') as Record<string, string[]> || {};

        // Record the visited URL
        if (!visitedUrls.includes(url)) {
            visitedUrls.push(url);
            await Actor.setValue('visitedUrls', visitedUrls);
        }

        // Extract emails from the important page areas
        const emailsFromZones = await page.evaluate((selectors: string[]) => {
            const emails: string[] = [];
            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

            selectors.forEach(selector => {
                try {
                    const elements = document.querySelectorAll(selector);
                    elements.forEach(element => {
                        const htmlElement = element as HTMLElement;
                        const text = element.textContent || htmlElement.innerText || '';
                        const html = element.innerHTML || '';

                        // Extract emails from the text
                        const textEmails = text.match(emailRegex) || [];
                        // Extract emails from the HTML (href attributes, etc.)
                        const htmlEmails = html.match(emailRegex) || [];

                        emails.push(...textEmails, ...htmlEmails);
                    });
                } catch (e) {
                    // Ignore selector errors
                }
            });

            return emails;
        }, IMPORTANT_SELECTORS);

        // Extract emails from mailto links
        const mailtoEmails = await page.evaluate(() => {
            const mailtoLinks = document.querySelectorAll('a[href^="mailto:"]');
            return Array.from(mailtoLinks).map(link => {
                const href = link.getAttribute('href');
                return href ? href.replace('mailto:', '').split('?')[0] : '';
            }).filter(email => email);
        });

        // Extract emails from the full page text (fallback)
        const allPageText = await page.evaluate(() => {
            return document.body.innerText || document.body.textContent || '';
        });
        const allPageEmails = allPageText.match(EMAIL_REGEX) || [];

        // Combine all emails found
        const allEmails = [...new Set([...emailsFromZones, ...mailtoEmails, ...allPageEmails])];

        // Clean up and validate the emails
        const validEmails = validateAndCleanEmails(allEmails, [
            'jpg', 'png', 'pdf', 'gif', 'jpeg', 'svg', 'ico', 'webp'
        ]);

        // Identify the new emails
        const newEmails = validEmails.filter(email => !foundEmails.includes(email));

        if (newEmails.length > 0) {
            log.info(`✅ Found ${newEmails.length} new emails on ${url}: ${newEmails.join(', ')}`);

            // Add them to the global list
            foundEmails.push(...newEmails);
            await Actor.setValue('foundEmails', foundEmails);

            // Organize them by domain
            if (!domainEmails[domain]) {
                domainEmails[domain] = [];
            }
            domainEmails[domain].push(...newEmails);
            await Actor.setValue('domainEmails', domainEmails);

            // Save them to the dataset
            await Actor.pushData({
                url,
                domain,
                emails: newEmails,
                emailCount: newEmails.length,
                depth,
                timestamp: new Date().toISOString(),
                source: 'targeted_extraction'
            });
        } else {
            log.info(`❌ No new email found on ${url}`);
        }

        // Follow important links if the depth allows it
        if (depth < maxDepth) {
            const importantLinks = await page.evaluate((patterns: RegExp[]) => {
                const links = Array.from(document.querySelectorAll('a[href]'));
                const importantLinks: ImportantLink[] = [];

                links.forEach(link => {
                    const href = link.getAttribute('href');
                    const htmlElement = link as HTMLElement;
                    const text = link.textContent || htmlElement.innerText || '';
                    const title = link.getAttribute('title') || '';

                    if (href && (href.startsWith('http') || href.startsWith('/'))) {
                        const fullText = `${text} ${title} ${href}`.toLowerCase();

                        // Check whether the link matches the important patterns
                        const isImportant = patterns.some((pattern: RegExp) => pattern.test(fullText));

                        if (isImportant) {
                            const fullUrl = href.startsWith('/') ?
                                new URL(href, window.location.origin).href : href;
                            importantLinks.push({
                                url: fullUrl,
                                text: text.trim()
                            });
                        }
                    }
                });

                return importantLinks;
            }, IMPORTANT_LINK_PATTERNS);

            // Enqueue the important links
            for (const link of importantLinks.slice(0, 5)) { // Limit to 5 links per page
                if (!visitedUrls.includes(link.url)) {
                    log.info(`🔗 Enqueuing important link: ${link.url} (${link.text})`);

                    await enqueueLinks({
                        urls: [link.url],
                        userData: {
                            depth: depth + 1,
                            maxDepth,
                            source: url,
                            linkText: link.text
                        }
                    });
                }
            }
        }

    } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        log.error(`❌ Error while processing ${url}: ${errorMessage}`);
    }
});

// Main execution function
await Actor.main(async () => {
    const input = await Actor.getInput<CrawlerInput>();

    if (!input || !Array.isArray(input.startUrls) || input.startUrls.length === 0) {
        throw new Error('The "startUrls" setting is required and must be a non-empty array of URLs');
    }

    // Initialize the global state
    await Actor.setValue('foundEmails', [] as string[]);
    await Actor.setValue('visitedUrls', [] as string[]);
    await Actor.setValue('domainEmails', {} as Record<string, string[]>);

    console.log('🚀 Starting the crawler with the following URLs:', input.startUrls);

    // Configuration
    const maxDepth = input.maxDepth || 2;
    const maxPagesPerDomain = input.maxPagesPerDomain || 30;
    const followImportantLinks = input.followImportantLinks !== false;

    // Create the crawler
    const crawler = new PlaywrightCrawler({
        requestHandler: router,

        launchContext: {
            launchOptions: {
                headless: true,
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-web-security'
                ]
            },
        },

        maxRequestsPerCrawl: input.maxRequestsPerCrawl || maxPagesPerDomain,
        navigationTimeoutSecs: 20,
        maxConcurrency: 2,

        // Ignore navigation errors
        failedRequestHandler: async ({ request }, error) => {
            const errorMessage = error instanceof Error ? error.message : String(error);
            console.error(`❌ Failed to process ${request.url}:`, errorMessage);
        },
    });

    // Prepare the initial requests
    const initialRequests = input.startUrls.map((urlData) => {
        const baseRequest = typeof urlData === 'string' ? { url: urlData } : urlData;
        return {
            ...baseRequest,
            userData: {
                depth: 0,
                maxDepth,
                followImportantLinks
            }
        };
    });

    try {
        // Run the crawler
        await crawler.run(initialRequests);

        // Fetch the final results
        const foundEmails = await Actor.getValue('foundEmails') as string[] || [];
        const visitedUrls = await Actor.getValue('visitedUrls') as string[] || [];
        const domainEmails = await Actor.getValue('domainEmails') as Record<string, string[]> || {};

        // Build the final result
        const finalResult = {
            success: true,
            totalEmailsFound: foundEmails.length,
            emails: foundEmails.sort(),
            emailsByDomain: domainEmails,
            urlsVisited: visitedUrls.length,
            visitedUrls: visitedUrls,
            timestamp: new Date().toISOString()
        };

        // Save the final result
        await Actor.setValue('OUTPUT', finalResult);
        await Actor.pushData(finalResult);

        console.log(`🎉 Crawling finished successfully!`);
        console.log(`📧 ${foundEmails.length} unique emails found`);
        console.log(`📄 ${visitedUrls.length} pages visited`);
        console.log(`🌐 ${Object.keys(domainEmails).length} domains analyzed`);

    } catch (error) {
        console.error('❌ Error during crawling:', error);

        // Save the partial results
        const foundEmails = await Actor.getValue('foundEmails') as string[] || [];
        const visitedUrls = await Actor.getValue('visitedUrls') as string[] || [];
        const domainEmails = await Actor.getValue('domainEmails') as Record<string, string[]> || {};

        const errorMessage = error instanceof Error ? error.message : String(error);

        const finalResult = {
            success: false,
            totalEmailsFound: foundEmails.length,
            emails: foundEmails.sort(),
            emailsByDomain: domainEmails,
            urlsVisited: visitedUrls.length,
            visitedUrls: visitedUrls,
            error: errorMessage,
            timestamp: new Date().toISOString()
        };

        await Actor.setValue('OUTPUT', finalResult);
        await Actor.pushData(finalResult);

        console.log('⚠️ Crawling finished with an error; partial results have been saved.');
    }
});
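Once the Actor is deployed, a run can also be triggered programmatically. The following is a minimal sketch using the apify-client package (not part of this project); the account name placeholder and the APIFY_TOKEN environment variable are assumptions, and the "my-actor-2" ID comes from .actor/actor.json above. The aggregated summary stored under the OUTPUT key can likewise be read from the run's default key-value store.

// Minimal sketch, assuming the Actor is published under your account as "my-actor-2"
// and an API token is available in the APIFY_TOKEN environment variable.
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start a run with a small input and wait for it to finish.
const run = await client.actor('<your-username>/my-actor-2').call({
    startUrls: [{ url: 'https://www.example.com' }],
    maxDepth: 1,
});

// The per-page records pushed via Actor.pushData() land in the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Retrieved ${items.length} dataset records`);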
.dockerignore
# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "root": true, "env": { "browser": true, "es2020": true, "node": true }, "extends": [ "@apify/eslint-config-ts" ], "parserOptions": { "project": "./tsconfig.json", "ecmaVersion": 2020 }, "ignorePatterns": [ "node_modules", "dist", "**/*.d.ts" ]}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage
package.json
{ "name": "crawlee-playwright-typescript", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.2.6", "crawlee": "^3.11.5", "playwright": "*" }, "devDependencies": { "@apify/eslint-config-ts": "^0.3.0", "@apify/tsconfig": "^0.1.0", "@typescript-eslint/eslint-plugin": "^7.18.0", "@typescript-eslint/parser": "^7.18.0", "eslint": "^8.50.0", "tsx": "^4.6.2", "typescript": "^5.3.3" }, "scripts": { "start": "npm run start:dev", "start:prod": "node dist/main.js", "start:dev": "tsx src/main.ts", "build": "tsc", "lint": "eslint ./src --ext .ts", "lint:fix": "eslint ./src --ext .ts --fix", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", "postinstall": "npx crawlee install-playwright-browsers" }, "author": "It's not you it's me", "license": "ISC"}
tsconfig.json
{ "extends": "@apify/tsconfig", "compilerOptions": { "module": "NodeNext", "moduleResolution": "NodeNext", "target": "ES2022", "outDir": "dist", "noUnusedLocals": false, "skipLibCheck": true, "lib": ["DOM"] }, "include": [ "./src/**/*" ]}