1import { Actor } from 'apify';
2import { CheerioCrawler } from 'crawlee';
3
// Bootstrap the Apify actor and read its input, falling back to an
// empty object when the actor is run without any input (local runs).
await Actor.init();

const input = await Actor.getInput() ?? {};

// Supported input fields with their defaults.
// NOTE(review): the input previously also destructured `deduplicateEmails`,
// but the binding was never read — emails are always deduplicated by the
// `allEmails` Map below — so the dead local was removed. The input field,
// if supplied, is simply ignored as before.
const {
  urls = [],
  maxPagesPerDomain = 20,
  maxDepth = 2,
  includePhones = true,
  includeSocialLinks = true,
} = input;

// Aggregated results, shared with the crawler's request handler.
// Map/Set keys give free deduplication.
const allEmails = new Map();  // email -> { email, source, domain, fromMailto? }
const allPhones = new Set();  // raw phone strings as found on the page
const allSocials = new Map(); // "platform:lowercased-url" -> { platform, url, foundOn }
20
21
// Loose email matcher. Intentionally permissive — obvious junk (asset
// filenames, noreply addresses, …) is filtered out later by
// junkEmailPatterns/isValidEmail. Used only via String#match with the
// /g flag, which returns all matches and does not leave lastIndex state.
const emailRegex = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;


// Phone matcher covering three alternates:
//   1. international: "+CC" followed by separated digit groups,
//   2. US parenthesised: "(123) 456-7890",
//   3. plain dashed/dotted: "123-456-7890".
// Matches are further sanity-checked by cleanPhone (digit count 7..15).
const phoneRegex = /\+[1-9]\d{0,2}[\s\-.]\(?\d{2,4}\)?[\s\-.]?\d{3,4}[\s\-.]?\d{3,4}|\(\d{3}\)[\s\-.]?\d{3}[\s\-.]?\d{4}|\b\d{3}[\-.]\d{3}[\-.]\d{4}\b/g;


// Per-platform profile-URL matchers, applied to the page's raw HTML so
// links in attributes and scripts are caught too. All are used with
// String#match (stateless despite /g), so sharing the literals across
// requests is safe.
const socialPatterns = {
  linkedin: /(?:https?:\/\/)?(?:www\.)?linkedin\.com\/(?:in|company)\/[a-zA-Z0-9\-_.]+/gi,
  twitter: /(?:https?:\/\/)?(?:www\.)?(?:twitter\.com|x\.com)\/[a-zA-Z0-9_]+/gi,
  facebook: /(?:https?:\/\/)?(?:www\.)?facebook\.com\/[a-zA-Z0-9.\-]+/gi,
  instagram: /(?:https?:\/\/)?(?:www\.)?instagram\.com\/[a-zA-Z0-9_.]+/gi,
  youtube: /(?:https?:\/\/)?(?:www\.)?youtube\.com\/(?:@|channel\/|c\/)[a-zA-Z0-9_\-]+/gi,
  github: /(?:https?:\/\/)?(?:www\.)?github\.com\/[a-zA-Z0-9\-]+/gi,
  tiktok: /(?:https?:\/\/)?(?:www\.)?tiktok\.com\/@[a-zA-Z0-9_.]+/gi,
};
37
38
// Patterns that mark an extracted string as NOT a usable contact email:
// automated senders, placeholder/test domains, hosting-platform noise,
// and asset filenames that merely look like addresses ("logo@2x.png").
const junkEmailPatterns = [
  /noreply@/i, /no-reply@/i, /donotreply@/i,
  /example\.com$/i, /test\.com$/i, /localhost$/i,
  /sentry\.io$/i, /wixpress\.com$/i, /wordpress\.com$/i,
  /\.png$/i, /\.jpg$/i, /\.gif$/i, /\.svg$/i,
  /@2x\./i, /@3x\./i,
];

/**
 * Decide whether an extracted email address is worth keeping.
 *
 * @param {string} email - Candidate address (callers lowercase/trim first).
 * @returns {boolean} true when the address is a plausible contact email:
 *   at most 100 characters and matching none of junkEmailPatterns.
 */
function isValidEmail(email) {
  const tooLong = email.length > 100;
  // None of the junk patterns use /g, so .test() here is stateless.
  return !tooLong && !junkEmailPatterns.some((pattern) => pattern.test(email));
}
54
/**
 * Sanity-check a phone-number candidate extracted by phoneRegex.
 *
 * @param {string} phone - Raw matched string, formatting included.
 * @returns {string|null} The trimmed original string (formatting is
 *   preserved, not normalised) when it contains 7-15 digits, else null.
 */
function cleanPhone(phone) {
  const digitCount = phone.replace(/\D/g, '').length;
  // 7..15 digits spans short national numbers up to the E.164 maximum.
  const plausible = digitCount >= 7 && digitCount <= 15;
  return plausible ? phone.trim() : null;
}
60
// Crawls every seed URL plus same-hostname links up to `maxDepth`,
// harvesting contacts into the module-level allEmails / allPhones /
// allSocials collections as a side effect of each request.
const crawler = new CheerioCrawler({
  // NOTE(review): this is a single global cap across ALL seed domains,
  // not a per-domain budget — one link-heavy site can consume another
  // site's share. With empty `urls` this evaluates to 0; confirm how
  // crawlee treats a zero limit if that case matters.
  maxRequestsPerCrawl: urls.length * maxPagesPerDomain,
  maxConcurrency: 10,
  requestHandlerTimeoutSecs: 30,

  async requestHandler({ $, request, log, enqueueLinks }) {
    // Seed requests carry { depth: 0 } and no sourceDomain; enqueued
    // requests carry both (see enqueueLinks below).
    const { depth = 0, sourceDomain } = request.userData;
    log.info(`[depth=${depth}] Scanning: ${request.url}`);

    const html = $.html();
    const text = $('body').text();
    const domain = new URL(request.url).hostname;

    // Scan both raw HTML and visible text: HTML catches addresses inside
    // attributes/scripts, text catches addresses split across markup.
    const htmlEmails = html.match(emailRegex) || [];
    const textEmails = text.match(emailRegex) || [];
    const allFound = [...new Set([...htmlEmails, ...textEmails])];

    for (const email of allFound) {
      const cleanEmail = email.toLowerCase().trim();
      // First occurrence wins: the Map key dedupes across all pages,
      // keeping the URL where the address was first seen.
      if (isValidEmail(cleanEmail) && !allEmails.has(cleanEmail)) {
        allEmails.set(cleanEmail, {
          email: cleanEmail,
          source: request.url,
          domain,
        });
      }
    }

    // mailto: anchors are the most reliable source; strip any ?subject=
    // query part and tag the record so consumers can trust it more.
    $('a[href^="mailto:"]').each((i, el) => {
      const href = $(el).attr('href');
      const email = href.replace('mailto:', '').split('?')[0].toLowerCase().trim();
      if (email && isValidEmail(email) && !allEmails.has(email)) {
        allEmails.set(email, {
          email,
          source: request.url,
          domain,
          fromMailto: true,
        });
      }
    });

    if (includePhones) {
      // tel: anchors are taken as-is (no cleanPhone digit-count check) —
      // presumably trusted because the site author marked them up.
      $('a[href^="tel:"]').each((i, el) => {
        const phone = $(el).attr('href').replace('tel:', '').trim();
        if (phone) allPhones.add(phone);
      });

      // Free-text matches go through cleanPhone to drop short/overlong
      // digit runs (dates, prices, IDs) that the regex can pick up.
      const foundPhones = text.match(phoneRegex) || [];
      for (const phone of foundPhones) {
        const cleaned = cleanPhone(phone);
        if (cleaned) allPhones.add(cleaned);
      }
    }

    if (includeSocialLinks) {
      // Key is "platform:lowercased-url" so the same profile linked with
      // different casing is stored once; the original casing is kept in
      // the stored `url`.
      for (const [platform, regex] of Object.entries(socialPatterns)) {
        const matches = html.match(regex) || [];
        for (const match of matches) {
          const key = `${platform}:${match.toLowerCase()}`;
          if (!allSocials.has(key)) {
            allSocials.set(key, {
              platform,
              url: match,
              foundOn: request.url,
            });
          }
        }
      }
    }

    // Follow internal links, propagating the ORIGINAL seed hostname so a
    // redirect mid-crawl cannot widen the scope.
    if (depth < maxDepth) {
      const targetDomain = sourceDomain || domain;
      await enqueueLinks({
        strategy: 'same-domain',
        userData: { depth: depth + 1, sourceDomain: targetDomain },
        transformRequestFunction: (req) => {
          // 'same-domain' allows subdomains; this narrows to the exact
          // hostname. Returning false drops the candidate request.
          try {
            const linkDomain = new URL(req.url).hostname;
            if (linkDomain !== targetDomain) return false;
          } catch {
            return false;
          }

          // NOTE(review): priority is recorded for contact-like pages but
          // nothing in this file consumes it — presumably intended for a
          // priority-aware queue; confirm before relying on it.
          const priorityPages = /contact|about|team|imprint|impressum|privacy|legal/i;
          if (priorityPages.test(req.url)) {
            req.userData.priority = 1;
          }
          return req;
        },
      });
    }
  },
});
162
163
// Seed the crawler with the input URLs, normalising bare domains to
// https. A real scheme check is used here: the previous
// `url.startsWith('http')` wrongly accepted bare domains that merely
// begin with "http" (e.g. "httpcompany.com") and missed uppercase
// "HTTP://..." schemes.
const requests = urls.map((url) => ({
  url: /^https?:\/\//i.test(url) ? url : `https://${url}`,
  userData: { depth: 0 },
}));

await crawler.addRequests(requests);
await crawler.run();
171
172
// Materialise the collected contacts for output.
const emailResults = [...allEmails.values()];
const phoneResults = [...allPhones].map((phone) => ({ phone }));
const socialResults = [...allSocials.values()];

if (emailResults.length > 0) {
  const scrapedAt = new Date().toISOString();
  // One dataset item per email. Actor.pushData accepts an array, so all
  // records are pushed in a single batched call instead of one awaited
  // API round-trip per record (the previous per-item loop).
  await Actor.pushData(emailResults.map((emailData) => ({
    ...emailData,
    // NOTE(review): every email record carries the FULL phone list (phones
    // are not attributed to a source page); socials are filtered to the
    // page the email was found on. Preserved as-is from the original.
    phones: includePhones ? [...allPhones] : undefined,
    socialLinks: includeSocialLinks ? socialResults.filter((s) => s.foundOn === emailData.source) : undefined,
    scrapedAt,
  })));
} else if (phoneResults.length > 0 || socialResults.length > 0) {
  // No emails, but some other contact data: emit a single summary item.
  await Actor.pushData({
    email: null,
    message: 'No emails found on the provided URLs',
    phones: includePhones ? [...allPhones] : [],
    socialLinks: includeSocialLinks ? socialResults : [],
    urlsScanned: urls.length,
    scrapedAt: new Date().toISOString(),
  });
} else {
  // Nothing at all was found; still emit one item so downstream
  // consumers can distinguish "ran and found nothing" from "never ran".
  await Actor.pushData({
    email: null,
    message: 'No contact information found on the provided URLs',
    urlsScanned: urls.length,
    scrapedAt: new Date().toISOString(),
  });
}

console.log(`\nExtraction complete!`);
console.log(`Emails found: ${emailResults.length}`);
console.log(`Phones found: ${phoneResults.length}`);
console.log(`Social links found: ${socialResults.length}`);

await Actor.exit();