Email

Developed by TML · Maintained by Community · Rating: 0.0 (0)

Pricing: Pay per usage
Total users: 3
Monthly users: 2
Runs succeeded: >99%
Last modified: 10 days ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20 AS builder
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false
# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./
# Build the project (compiles TypeScript from src/ into dist/).
RUN npm run build
# Create final image
FROM apify/actor-node-playwright-chrome:20
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Copy built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-2",
    "title": "Project Playwright Crawler Typescript",
    "description": "Crawlee and Playwright project in TypeScript.",
    "version": "0.0",
    "meta": {
        "templateId": "ts-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Advanced Email Extractor Crawler",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start crawling from. The crawler automatically follows important links (contact, legal notices, etc.)",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://www.example.com"
                }
            ]
        },
        "maxRequestsPerCrawl": {
            "title": "Maximum number of requests",
            "type": "integer",
            "description": "Maximum number of pages to visit in total",
            "default": 30,
            "minimum": 1,
            "maximum": 100
        },
        "maxDepth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Maximum navigation depth (0 = home page only, 1 = + direct links, 2 = + second-level links)",
            "default": 2,
            "minimum": 0,
            "maximum": 3
        },
        "maxPagesPerDomain": {
            "title": "Max pages per domain",
            "type": "integer",
            "description": "Maximum number of pages to crawl per domain",
            "default": 20,
            "minimum": 1,
            "maximum": 50
        },
        "followImportantLinks": {
            "title": "Follow important links",
            "type": "boolean",
            "description": "Automatically follow links to contact, legal notice, team, and similar pages",
            "default": true
        },
        "excludePatterns": {
            "title": "Patterns to exclude",
            "type": "array",
            "description": "Extensions or words to exclude from the extracted emails",
            "editor": "stringList",
            "default": ["jpg", "png", "pdf", "gif", "jpeg", "svg", "ico", "webp"],
            "prefill": ["jpg", "png", "pdf", "gif", "jpeg", "svg", "ico", "webp"]
        },
        "targetSelectors": {
            "title": "Targeted CSS selectors (optional)",
            "type": "array",
            "description": "Additional CSS selectors to target specific page areas (e.g. '.company-info', '#team-section')",
            "editor": "stringList",
            "default": []
        },
        "includeGenericEmails": {
            "title": "Include generic emails",
            "type": "boolean",
            "description": "Include emails such as info@, contact@, support@ (usually excluded by default)",
            "default": false
        },
        "waitForSelector": {
            "title": "Wait for selector (optional)",
            "type": "string",
            "description": "CSS selector to wait for before extracting emails (useful for JavaScript-heavy sites)",
            "editor": "textfield",
            "prefill": ""
        },
        "customUserAgent": {
            "title": "Custom User-Agent (optional)",
            "type": "string",
            "description": "User-Agent to use for HTTP requests",
            "editor": "textfield",
            "prefill": ""
        },
        "respectRobotsTxt": {
            "title": "Respect robots.txt",
            "type": "boolean",
            "description": "Respect the directives of the site's robots.txt file",
            "default": true
        },
        "delayBetweenRequests": {
            "title": "Delay between requests (ms)",
            "type": "integer",
            "description": "Delay in milliseconds between requests to avoid overloading the server",
            "default": 1000,
            "minimum": 0,
            "maximum": 10000
        },
        "exportFormat": {
            "title": "Export format",
            "type": "string",
            "description": "Output format of the results",
            "editor": "select",
            "enum": ["json", "csv", "txt"],
            "enumTitles": ["JSON (detailed)", "CSV (simple)", "TXT (plain list)"],
            "default": "json"
        },
        "groupByDomain": {
            "title": "Group by domain",
            "type": "boolean",
            "description": "Organize the results by domain name",
            "default": true
        },
        "verbose": {
            "title": "Verbose mode",
            "type": "boolean",
            "description": "Print additional debug information in the logs",
            "default": false
        }
    },
    "required": ["startUrls"],
    "additionalProperties": false
}
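
For orientation, a minimal run input conforming to this schema might look like the sketch below, written as a TypeScript object literal. The values are illustrative only, taken from the prefills and defaults declared above.

// Hypothetical example input matching .actor/input_schema.json.
// Only startUrls is required; the other fields simply restate the defaults above.
const exampleInput = {
    startUrls: [{ url: 'https://www.example.com' }],
    maxRequestsPerCrawl: 30,
    maxDepth: 2,
    maxPagesPerDomain: 20,
    followImportantLinks: true,
    excludePatterns: ['jpg', 'png', 'pdf', 'gif', 'jpeg', 'svg', 'ico', 'webp'],
    includeGenericEmails: false,
    exportFormat: 'json',
    groupByDomain: true,
    verbose: false,
};

Note that the CrawlerInput interface in src/main.ts declares only a subset of these options; fields such as respectRobotsTxt, delayBetweenRequests, waitForSelector, and exportFormat are accepted by the schema but not yet consumed by the code below.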

src/main.ts

/**
 * Advanced crawler that extracts email addresses from a set of URLs.
 * Uses Crawlee, Playwright, and headless Chrome for smart extraction.
 * Specifically targets footers, legal pages, and other important sections.
 */

import { Actor } from 'apify';
import { PlaywrightCrawler, createPlaywrightRouter } from 'crawlee';

// Regular expression used to detect email addresses
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

// Selectors for the page areas most likely to contain emails
const IMPORTANT_SELECTORS = [
    'footer', '.footer', '#footer', '[class*="footer"]', '[id*="footer"]',
    '.contact', '#contact', '[class*="contact"]', '[id*="contact"]',
    '.legal', '.mentions', '.copyright', '.about', '.team', '.staff',
    '[class*="legal"]', '[class*="mention"]', '[class*="copyright"]',
    '[class*="about"]', '[class*="team"]', '[class*="staff"]',
    '.qui-sommes-nous', '.equipe', '.équipe', '.coordonnees', '.coordonnées'
];

// Patterns used to identify important links (contact, legal, team, etc.)
const IMPORTANT_LINK_PATTERNS = [
    /contact/i, /mention/i, /legal/i, /about/i, /team/i, /staff/i,
    /qui.*sommes/i, /équipe/i, /equipe/i, /cgv/i, /cgu/i,
    /politique/i, /confidentialité/i, /confidentialite/i, /privacy/i,
    /terms/i, /conditions/i, /coordonnees/i, /coordonnées/i
];

// Input type definition
interface CrawlerInput {
    startUrls: any[];
    maxRequestsPerCrawl: number;
    maxDepth?: number;
    maxPagesPerDomain?: number;
    followImportantLinks?: boolean;
    excludePatterns?: string[];
}

// Interface describing an important link
interface ImportantLink {
    url: string;
    text: string;
}

// Clean up and validate the extracted emails
function validateAndCleanEmails(emails: string[], excludePatterns: string[]): string[] {
    return emails
        .map(email => email.trim().toLowerCase())
        .filter(email => {
            if (!email) return false;

            // Exclude matches that are really image file names
            const hasImageExtension = excludePatterns.some(pattern =>
                email.includes(pattern.toLowerCase())
            );

            // Check the overall format
            const isValidFormat = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/.test(email);

            // Exclude generic/example emails
            const isGeneric = [
                'example.com', 'test.com', 'domain.com', 'yoursite.com',
                'website.com', 'noreply@', 'no-reply@', 'placeholder',
                'dummy', 'fake', 'sample'
            ].some(generic => email.includes(generic));

            return !hasImageExtension && isValidFormat && !isGeneric;
        });
}

// Extract the domain name from a URL
function getDomainFromUrl(url: string): string {
    try {
        return new URL(url).hostname;
    } catch {
        return 'unknown';
    }
}

// Create the router that holds the extraction logic
const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ page, request, log, enqueueLinks }) => {
    const url = request.url;
    const depth = request.userData?.depth || 0;
    const maxDepth = request.userData?.maxDepth || 1;
    const domain = getDomainFromUrl(url);

    log.info(`🔍 Processing: ${url} (depth: ${depth}, domain: ${domain})`);

    try {
        // Wait for the page to settle
        await page.waitForLoadState('networkidle', { timeout: 10000 });

        // Load the global state
        const foundEmails = await Actor.getValue('foundEmails') as string[] || [];
        const visitedUrls = await Actor.getValue('visitedUrls') as string[] || [];
        const domainEmails = await Actor.getValue('domainEmails') as Record<string, string[]> || {};

        // Record the visited URL
        if (!visitedUrls.includes(url)) {
            visitedUrls.push(url);
            await Actor.setValue('visitedUrls', visitedUrls);
        }

        // Extract emails from the important page areas
        const emailsFromZones = await page.evaluate((selectors: string[]) => {
            const emails: string[] = [];
            const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

            selectors.forEach(selector => {
                try {
                    const elements = document.querySelectorAll(selector);
                    elements.forEach(element => {
                        const htmlElement = element as HTMLElement;
                        const text = element.textContent || htmlElement.innerText || '';
                        const html = element.innerHTML || '';

                        // Emails found in the visible text
                        const textEmails = text.match(emailRegex) || [];
                        // Emails found in the HTML (href attributes, etc.)
                        const htmlEmails = html.match(emailRegex) || [];

                        emails.push(...textEmails, ...htmlEmails);
                    });
                } catch (e) {
                    // Ignore invalid selectors
                }
            });

            return emails;
        }, IMPORTANT_SELECTORS);

        // Extract emails from mailto: links
        const mailtoEmails = await page.evaluate(() => {
            const mailtoLinks = document.querySelectorAll('a[href^="mailto:"]');
            return Array.from(mailtoLinks).map(link => {
                const href = link.getAttribute('href');
                return href ? href.replace('mailto:', '').split('?')[0] : '';
            }).filter(email => email);
        });

        // Extract emails from the full page text (fallback)
        const allPageText = await page.evaluate(() => {
            return document.body.innerText || document.body.textContent || '';
        });
        const allPageEmails = allPageText.match(EMAIL_REGEX) || [];

        // Merge all emails found on the page
        const allEmails = [...new Set([...emailsFromZones, ...mailtoEmails, ...allPageEmails])];

        // Clean up and validate them
        const validEmails = validateAndCleanEmails(allEmails, [
            'jpg', 'png', 'pdf', 'gif', 'jpeg', 'svg', 'ico', 'webp'
        ]);

        // Keep only the emails we have not seen yet
        const newEmails = validEmails.filter(email => !foundEmails.includes(email));

        if (newEmails.length > 0) {
            log.info(`✅ Found ${newEmails.length} new emails on ${url}: ${newEmails.join(', ')}`);

            // Add them to the global list
            foundEmails.push(...newEmails);
            await Actor.setValue('foundEmails', foundEmails);

            // Group them by domain
            if (!domainEmails[domain]) {
                domainEmails[domain] = [];
            }
            domainEmails[domain].push(...newEmails);
            await Actor.setValue('domainEmails', domainEmails);

            // Save a record to the dataset
            await Actor.pushData({
                url,
                domain,
                emails: newEmails,
                emailCount: newEmails.length,
                depth,
                timestamp: new Date().toISOString(),
                source: 'targeted_extraction'
            });
        } else {
            log.info(`❌ No new email found on ${url}`);
        }

        // Follow important links if the depth limit allows it
        if (depth < maxDepth) {
            const importantLinks = await page.evaluate((patterns: RegExp[]) => {
                const links = Array.from(document.querySelectorAll('a[href]'));
                const importantLinks: ImportantLink[] = [];

                links.forEach(link => {
                    const href = link.getAttribute('href');
                    const htmlElement = link as HTMLElement;
                    const text = link.textContent || htmlElement.innerText || '';
                    const title = link.getAttribute('title') || '';

                    if (href && (href.startsWith('http') || href.startsWith('/'))) {
                        const fullText = `${text} ${title} ${href}`.toLowerCase();

                        // Check whether the link matches one of the important patterns
                        const isImportant = patterns.some((pattern: RegExp) => pattern.test(fullText));

                        if (isImportant) {
                            const fullUrl = href.startsWith('/') ?
                                new URL(href, window.location.origin).href : href;
                            importantLinks.push({
                                url: fullUrl,
                                text: text.trim()
                            });
                        }
                    }
                });

                return importantLinks;
            }, IMPORTANT_LINK_PATTERNS);

            // Enqueue the important links (at most 5 per page)
            for (const link of importantLinks.slice(0, 5)) {
                if (!visitedUrls.includes(link.url)) {
                    log.info(`🔗 Enqueueing important link: ${link.url} (${link.text})`);

                    await enqueueLinks({
                        urls: [link.url],
                        userData: {
                            depth: depth + 1,
                            maxDepth,
                            source: url,
                            linkText: link.text
                        }
                    });
                }
            }
        }

    } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        log.error(`❌ Error while processing ${url}: ${errorMessage}`);
    }
});

// Main entry point
await Actor.main(async () => {
    const input = await Actor.getInput<CrawlerInput>();

    if (!input || !Array.isArray(input.startUrls) || input.startUrls.length === 0) {
        throw new Error('The "startUrls" input is required and must be a non-empty array of URLs');
    }

    // Initialize the global state
    await Actor.setValue('foundEmails', [] as string[]);
    await Actor.setValue('visitedUrls', [] as string[]);
    await Actor.setValue('domainEmails', {} as Record<string, string[]>);

    console.log('🚀 Starting the crawler with the following URLs:', input.startUrls);

    // Configuration
    const maxDepth = input.maxDepth || 2;
    const maxPagesPerDomain = input.maxPagesPerDomain || 30;
    const followImportantLinks = input.followImportantLinks !== false;

    // Create the crawler
    const crawler = new PlaywrightCrawler({
        requestHandler: router,

        launchContext: {
            launchOptions: {
                headless: true,
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-web-security'
                ]
            },
        },

        maxRequestsPerCrawl: input.maxRequestsPerCrawl || maxPagesPerDomain,
        navigationTimeoutSecs: 20,
        maxConcurrency: 2,

        // Log navigation failures without stopping the crawl
        failedRequestHandler: async ({ request }, error) => {
            const errorMessage = error instanceof Error ? error.message : String(error);
            console.error(`❌ Failed to process ${request.url}:`, errorMessage);
        },
    });

    // Prepare the initial requests
    const initialRequests = input.startUrls.map((urlData) => {
        const baseRequest = typeof urlData === 'string' ? { url: urlData } : urlData;
        return {
            ...baseRequest,
            userData: {
                depth: 0,
                maxDepth,
                followImportantLinks
            }
        };
    });

    try {
        // Run the crawler
        await crawler.run(initialRequests);

        // Collect the final results
        const foundEmails = await Actor.getValue('foundEmails') as string[] || [];
        const visitedUrls = await Actor.getValue('visitedUrls') as string[] || [];
        const domainEmails = await Actor.getValue('domainEmails') as Record<string, string[]> || {};

        // Build the final result
        const finalResult = {
            success: true,
            totalEmailsFound: foundEmails.length,
            emails: foundEmails.sort(),
            emailsByDomain: domainEmails,
            urlsVisited: visitedUrls.length,
            visitedUrls: visitedUrls,
            timestamp: new Date().toISOString()
        };

        // Save the final result
        await Actor.setValue('OUTPUT', finalResult);
        await Actor.pushData(finalResult);

        console.log(`🎉 Crawling finished successfully!`);
        console.log(`📧 ${foundEmails.length} unique emails found`);
        console.log(`📄 ${visitedUrls.length} pages visited`);
        console.log(`🌐 ${Object.keys(domainEmails).length} domains analyzed`);

    } catch (error) {
        console.error('❌ Error during crawling:', error);

        // Save partial results
        const foundEmails = await Actor.getValue('foundEmails') as string[] || [];
        const visitedUrls = await Actor.getValue('visitedUrls') as string[] || [];
        const domainEmails = await Actor.getValue('domainEmails') as Record<string, string[]> || {};

        const errorMessage = error instanceof Error ? error.message : String(error);

        const finalResult = {
            success: false,
            totalEmailsFound: foundEmails.length,
            emails: foundEmails.sort(),
            emailsByDomain: domainEmails,
            urlsVisited: visitedUrls.length,
            visitedUrls: visitedUrls,
            error: errorMessage,
            timestamp: new Date().toISOString()
        };

        await Actor.setValue('OUTPUT', finalResult);
        await Actor.pushData(finalResult);

        console.log('⚠️ Processing finished with an error, partial results have been saved.');
    }
});
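
To show the extraction and validation steps in isolation, here is a small standalone sketch that mirrors the EMAIL_REGEX match and the validateAndCleanEmails() filtering from src/main.ts. The sample text, emails, and file name are made up for the example; it can be run with the tsx dev dependency listed in package.json.

// extract-demo.ts (hypothetical file) - simplified mirror of the logic in src/main.ts
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
const EXCLUDE_PATTERNS = ['jpg', 'png', 'pdf', 'gif', 'jpeg', 'svg', 'ico', 'webp'];

function validateAndCleanEmails(emails: string[], excludePatterns: string[]): string[] {
    return emails
        .map((email) => email.trim().toLowerCase())
        .filter((email) => {
            // Drop matches that are really file names, e.g. "logo@2x.png"
            const hasImageExtension = excludePatterns.some((p) => email.includes(p));
            // Re-check the whole string against an anchored version of the pattern
            const isValidFormat = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/.test(email);
            // Drop obvious placeholders (shortened list; main.ts checks more of them)
            const isGeneric = ['example.com', 'noreply@', 'placeholder'].some((g) => email.includes(g));
            return !hasImageExtension && isValidFormat && !isGeneric;
        });
}

// Made-up page text for the demo
const pageText = 'Contact: Sales@Acme-Corp.com | logo@2x.png | noreply@example.com';
const matches = pageText.match(EMAIL_REGEX) ?? [];
console.log([...new Set(validateAndCleanEmails(matches, EXCLUDE_PATTERNS))]);
// Expected output: [ 'sales@acme-corp.com' ]

The same deduplicate-then-filter pattern runs inside the Playwright request handler above, where the candidate list also includes matches from mailto: links and from the targeted footer/contact selectors.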

.dockerignore

# configurations
.idea
.vscode
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ]
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage

package.json

{
    "name": "crawlee-playwright-typescript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/eslint-config-ts": "^0.3.0",
        "@apify/tsconfig": "^0.1.0",
        "@typescript-eslint/eslint-plugin": "^7.18.0",
        "@typescript-eslint/parser": "^7.18.0",
        "eslint": "^8.50.0",
        "tsx": "^4.6.2",
        "typescript": "^5.3.3"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "tsc",
        "lint": "eslint ./src --ext .ts",
        "lint:fix": "eslint ./src --ext .ts --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": [
        "./src/**/*"
    ]
}