1import { Actor } from 'apify';
2import { log } from 'crawlee';
3import { chromium, type Browser } from 'playwright';
4
5
// Initialise the Apify SDK before any input or storage call.
await Actor.init();

// Raw actor input; every option below accepts a few alias keys for convenience.
const rawInput = (await Actor.getInput() ?? {}) as Record<string, unknown>;

/**
 * Coerces a raw input value to a finite number.
 * Returns `fallback` when the value is missing or does not convert to a finite
 * number (e.g. "abc"), so a typo cannot silently turn a limit into NaN.
 */
function numericInput(value: unknown, fallback: number): number {
  const n = Number(value ?? fallback);
  return Number.isFinite(n) ? n : fallback;
}

const mode = (rawInput.mode ?? rawInput.scrapeMode ?? 'leaderboard') as
  'leaderboard' | 'search' | 'topic' | 'urls';
const leaderboardPeriod = (rawInput.leaderboardPeriod ?? rawInput.period ?? 'daily') as
  'daily' | 'weekly' | 'monthly' | 'yearly';
const startDate = (rawInput.startDate ?? rawInput.date ?? '') as string;
const endDate = (rawInput.endDate ?? '') as string;
const lookbackDays = numericInput(rawInput.lookbackDays, 1);
const outputMode = (rawInput.outputMode ?? 'full') as 'full' | 'lean' | 'leads';
const searchQuery = (rawInput.searchQuery ?? rawInput.query ?? rawInput.q) as string | undefined;
const topic = (rawInput.topic ?? rawInput.topicSlug) as string | undefined;
const startUrls = (rawInput.startUrls ?? []) as Array<{ url: string } | string>;
const maxResults = numericInput(rawInput.maxResults ?? rawInput.maxItems ?? rawInput.limit, 1000);
const allProducts = Boolean(rawInput.allProducts ?? rawInput.includeAll ?? false);
const enrichEmails = Boolean(rawInput.enrichEmails ?? rawInput.findEmails ?? true);
const hunterApiKey = (rawInput.hunterApiKey ?? rawInput.hunterKey ?? process.env.HUNTER_API_KEY ?? '') as string;
// Whole number clamped to 1..5. The value is later used as a loop step for
// batching, so 0, a fraction or NaN would stall or corrupt that loop.
const maxConcurrency = Math.min(Math.max(Math.floor(numericInput(rawInput.maxConcurrency, 3)), 1), 5);
// NOTE(review): this is a credential committed to source. It is kept so existing
// runs that rely on the built-in token keep working, but it should be rotated and
// moved to a secret / environment variable.
const DEFAULT_PH_TOKEN = 'siHY_lSwsPoDFCx71pKL0Gj_jOvDIYTcaLKfgvVC8Y4';
const phApiToken = (rawInput.phApiToken ?? rawInput.apiToken ?? process.env.PH_API_TOKEN ?? DEFAULT_PH_TOKEN) as string;

// Tell the user when the shared fallback token is in use.
if (!rawInput.phApiToken && !rawInput.apiToken && !process.env.PH_API_TOKEN) {
  log.info('Using built-in PH API token. Supply your own via phApiToken input or PH_API_TOKEN env var.');
}
/**
 * Stops the run without marking it as failed: publishes `msg` as a terminal
 * WARNING status message, asks the SDK to exit, then exits the process with
 * code 0.
 *
 * @param msg Status text shown for the finished run.
 * @returns Never resolves normally; `process.exit` does not return, which is
 *          what lets the signature promise `never`.
 */
async function softExit(msg: string): Promise<never> {
  const terminalWarning = { isStatusMessageTerminal: true, level: 'WARNING' } as const;
  await Actor.setStatusMessage(msg, terminalWarning);
  await Actor.exit();
  process.exit(0);
}
// Each non-leaderboard mode needs its own input; stop with a terminal warning
// instead of running a query that cannot succeed.
if (mode === 'search' && !searchQuery) {
  await softExit('searchQuery required for search mode. Provide a search keyword and re-run.');
}
if (mode === 'topic' && !topic) {
  await softExit('topic required for topic mode. Provide a topic slug (e.g. "ai") and re-run.');
}
if (mode === 'urls' && startUrls.length === 0) {
  await softExit('startUrls required for urls mode. Provide one or more ProductHunt URLs and re-run.');
}

// Echo the effective configuration once so a run log is self-explanatory.
const emailMethod = hunterApiKey ? 'Hunter.io + Playwright fallback' : 'Playwright browser scraping';
log.info('ProductHunt Scraper starting', {
  mode,
  leaderboardPeriod,
  startDate,
  endDate: endDate || '(same as startDate)',
  lookbackDays,
  maxResults,
  allProducts,
  enrichEmails,
  outputMode,
  emailMethod,
});
48
49
/** Formats a date as its UTC calendar day (`YYYY-MM-DD`). */
function isoDate(d: Date): string {
  const iso = d.toISOString();
  return iso.slice(0, iso.indexOf('T'));
}

/**
 * Returns the Sunday..Saturday week containing `d`.
 *
 * All arithmetic uses UTC fields because `isoDate` formats in UTC; mixing
 * local-time getters with UTC formatting shifts the bounds by a day on hosts
 * whose timezone is not UTC. `d` itself is not modified.
 */
function getWeekBounds(d: Date): { start: string; end: string } {
  const sunday = new Date(d.getTime());
  sunday.setUTCDate(d.getUTCDate() - d.getUTCDay());
  const saturday = new Date(sunday.getTime());
  saturday.setUTCDate(sunday.getUTCDate() + 6);
  return { start: isoDate(sunday), end: isoDate(saturday) };
}

/** Returns the first and last UTC calendar day of the month containing `d`. */
function getMonthBounds(d: Date): { start: string; end: string } {
  const year = d.getUTCFullYear();
  const month = d.getUTCMonth();
  // Day 0 of the following month is the last day of this month.
  const first = new Date(Date.UTC(year, month, 1));
  const last = new Date(Date.UTC(year, month + 1, 0));
  return { start: isoDate(first), end: isoDate(last) };
}

/** Returns January 1st and December 31st of the UTC year containing `d`. */
function getYearBounds(d: Date): { start: string; end: string } {
  const year = d.getUTCFullYear();
  return { start: `${year}-01-01`, end: `${year}-12-31` };
}
64
/**
 * Builds the `postedAfter` / `postedBefore` filter for leaderboard mode.
 *
 * - Returns `null` for every mode other than `leaderboard`.
 * - `lookbackDays > 1` takes precedence over everything else: the window runs
 *   from 00:00 UTC `lookbackDays` days ago to 23:59:59 UTC today.
 * - Otherwise the window is derived from `leaderboardPeriod`, anchored on
 *   `startDate` (or today when it is empty). For `daily`, `endDate` may widen
 *   the window; it is ignored for the other periods.
 *
 * NOTE(review): `startDate` is assumed to be `YYYY-MM-DD`; any other shape
 * makes the anchor an invalid Date — confirm input validation upstream.
 */
function buildDateRange(): { postedAfter: string; postedBefore: string } | null {
  if (mode !== 'leaderboard') return null;

  // Both ends are whole UTC days, matching the +00:00 offset sent to the API.
  const span = (from: string, to: string) => ({
    postedAfter: `${from}T00:00:00+00:00`,
    postedBefore: `${to}T23:59:59+00:00`,
  });

  if (lookbackDays > 1) {
    const end = new Date();
    const begin = new Date(end.getTime());
    // UTC day arithmetic: isoDate() formats in UTC, so local-time getters
    // would shift the window by a day on hosts not running in UTC.
    begin.setUTCDate(end.getUTCDate() - lookbackDays);
    return span(isoDate(begin), isoDate(end));
  }

  // Noon UTC keeps the anchor on the requested calendar day.
  const base = startDate ? new Date(startDate + 'T12:00:00Z') : new Date();
  switch (leaderboardPeriod) {
    case 'daily': {
      const from = startDate || isoDate(base);
      const to = endDate || from;
      return span(from, to);
    }
    case 'weekly': {
      const { start, end } = getWeekBounds(base);
      return span(start, end);
    }
    case 'monthly': {
      const { start, end } = getMonthBounds(base);
      return span(start, end);
    }
    case 'yearly': {
      const { start, end } = getYearBounds(base);
      return span(start, end);
    }
    default:
      return null;
  }
}
95
96
const PH_API = 'https://api.producthunt.com/v2/api/graphql';

/**
 * Sends one GraphQL request to the ProductHunt API and returns its `data`.
 *
 * HTTP 429 responses are retried: the wait is `reset_in + 5` seconds (capped
 * at 120) when the error body provides `reset_in`, otherwise 30s times the
 * attempt number. Each request is aborted after 30s so a stalled connection
 * cannot hang the run.
 *
 * @param query     GraphQL document.
 * @param variables GraphQL variables.
 * @param attempt   Zero-based attempt counter to start from.
 * @returns The `data` member of the GraphQL response.
 * @throws Error on a 429 once `attempt` reaches 4, on any other non-2xx
 *         status, or when the response carries GraphQL `errors`; fetch
 *         rejections (network failure, timeout) propagate unchanged.
 */
async function phQuery(query: string, variables: Record<string, unknown> = {}, attempt = 0): Promise<any> {
  for (let tryNo = attempt; ; tryNo++) {
    const res = await fetch(PH_API, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Accept': 'application/json', 'Authorization': `Bearer ${phApiToken}` },
      body: JSON.stringify({ query, variables }),
      signal: AbortSignal.timeout(30000),
    });

    if (res.status === 429) {
      const b = await res.text().catch(() => '{}');
      let waitSecs = 30 * (tryNo + 1);
      try {
        const errData = JSON.parse(b);
        const resetIn = errData?.errors?.[0]?.details?.reset_in;
        if (resetIn && typeof resetIn === 'number') waitSecs = Math.min(resetIn + 5, 120);
      } catch {
        // Body was not JSON; keep the linear back-off computed above.
      }
      if (tryNo >= 4) throw new Error(`PH API 429 after ${tryNo + 1} retries: ${b.slice(0, 200)}`);
      log.warning(`PH API rate limited — waiting ${waitSecs}s before retry (attempt ${tryNo + 1}/5)...`);
      await new Promise(r => setTimeout(r, waitSecs * 1000));
      continue;
    }

    if (!res.ok) {
      const b = await res.text().catch(() => '');
      throw new Error(`PH API ${res.status}: ${b.slice(0,200)}`);
    }
    const json = await res.json() as { data?: any; errors?: any[] };
    if (json.errors?.length) throw new Error(`GraphQL: ${JSON.stringify(json.errors[0])}`);
    return json.data;
  }
}
123
// Selection set requested for every post; interpolated into both queries below.
const POST_FIELDS = `
 id name tagline description votesCount commentsCount url website slug
 featuredAt createdAt dailyRank weeklyRank monthlyRank yearlyRank
 reviewsCount reviewsRating
 thumbnail { url }
 topics(first: 10) { edges { node { name } } }
 makers { id name username twitterUsername url }
`;

// Cursor-paginated post listing with optional featured / date-window / topic filters.
const POSTS_QUERY = `query GetPosts($first:Int $after:String $order:PostsOrder $featured:Boolean $postedAfter:DateTime $postedBefore:DateTime $topic:String) {
 posts(first:$first after:$after order:$order featured:$featured postedAfter:$postedAfter postedBefore:$postedBefore topic:$topic) {
 edges { node { ${POST_FIELDS} } }
 pageInfo { endCursor hasNextPage }
 }
}`;
// Cursor-paginated keyword search, always ordered by votes.
const SEARCH_QUERY = `query SearchPosts($query:String! $first:Int $after:String) {
 posts(first:$first after:$after order:VOTES search:$query) {
 edges { node { ${POST_FIELDS} } }
 pageInfo { endCursor hasNextPage }
 }
}`;
145
146
147
148
/**
 * Follows redirects from `rawUrl` and returns the final destination without
 * query string or trailing slash.
 *
 * A HEAD request is tried first; if it still ends on producthunt.com, a GET is
 * tried as well. The original `rawUrl` is returned unchanged when it is empty,
 * when the request fails or times out, or when the destination is still a
 * producthunt.com URL.
 *
 * @param rawUrl URL to resolve; its query string is dropped before the request.
 * @returns The resolved URL, or `rawUrl` as a best-effort fallback. Never throws.
 */
async function resolveToRealUrl(rawUrl: string): Promise<string> {
  if (!rawUrl) return rawUrl;

  const cleanUrl = rawUrl.split('?')[0];

  // Issues one redirect-following request and reports only where it ended up.
  const finalUrlOf = async (method: 'HEAD' | 'GET', timeoutMs: number): Promise<string> => {
    const res = await fetch(cleanUrl, {
      method, redirect: 'follow',
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' },
      signal: AbortSignal.timeout(timeoutMs),
    });
    // Only the final URL matters; cancel the unread body so the connection is
    // released right away instead of lingering until garbage collection.
    await res.body?.cancel().catch(() => undefined);
    return res.url;
  };

  try {
    let finalUrl = await finalUrlOf('HEAD', 8000);
    if (finalUrl.includes('producthunt.com')) {
      finalUrl = await finalUrlOf('GET', 10000);
    }
    const resolved = finalUrl.split('?')[0].replace(/\/$/, '');
    return (resolved && !resolved.includes('producthunt.com')) ? resolved : rawUrl;
  } catch {
    // Best effort: any network, timeout or URL-parse failure keeps the input.
    return rawUrl;
  }
}
170
171
172
// Upper bound on pages findEmails() will load for a single website.
const MAX_PAGES_PER_DOMAIN = 8;

// Hosts for which findEmails() returns no emails without crawling
// (app stores, social networks, code hosts, generic SaaS platforms).
const SKIP_DOMAINS = new Set([
  'apps.apple.com', 'play.google.com', 'github.com', 'youtube.com',
  'twitter.com', 'x.com', 'linkedin.com', 'facebook.com', 'instagram.com',
  'reddit.com', 'discord.com', 'discord.gg', 'slack.com',
  'medium.com', 'substack.com', 'producthunt.com', 'amazon.com',
  'chrome.google.com', 'marketplace.visualstudio.com',
  'notion.so', 'figma.com', 'trello.com', 'airtable.com',
]);

// Desktop browser User-Agent strings; findEmails() picks one at random per
// browser context and uses the first entry for its plain HEAD request.
const UA_POOL = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3.1 Safari/605.1.15',
];

// Paths appended to the site origin and queued for crawling even when the
// landing page does not link to them.
const SEED_PATHS = [
  '/contact',
  '/contact-us',
  '/about',
  '/about-us',
  '/team',
  '/support',
  '/help',
  '/privacy',
  '/privacy-policy',
  '/',
];
204
205
/**
 * Scores a URL (or path) by how likely the page is to show a contact email.
 * Higher scores are crawled first.
 *
 * The first keyword found in the lower-cased input decides the score, in the
 * order listed. With no keyword, an input that is exactly `/` or contains
 * `/index` scores 5; anything else scores 20.
 */
function urlPriority(url: string): number {
  const lowered = url.toLowerCase();
  // Order matters: earlier keywords win when several are present.
  const keywordScores: ReadonlyArray<readonly [string, number]> = [
    ['contact', 100],
    ['about', 90],
    ['team', 80],
    ['support', 75],
    ['help', 70],
    ['company', 65],
    ['privacy', 60],
    ['legal', 55],
    ['imprint', 50],
    ['faq', 45],
    ['humans.txt', 40],
  ];
  for (const [keyword, score] of keywordScores) {
    if (lowered.includes(keyword)) return score;
  }
  return /^\/$|\/index/.test(lowered) ? 5 : 20;
}
222
223
/**
 * Collects email addresses from the current document.
 *
 * This function is handed to `page.evaluate`, so it must stay self-contained:
 * it may only use its argument and page globals, never names from this module.
 *
 * Sources, in order:
 *  1. `mailto:` links (percent-decoded; several comma-separated recipients
 *     are supported).
 *  2. JSON-LD `email` and `contactPoint.email` (`contactPoint` may be a single
 *     object or an array).
 *  3. Addresses in the visible text and raw HTML whose domain matches
 *     `baseDomain`, is a subdomain of it, or is a parent of it.
 *  4. Only if nothing was found so far: every non-junk address in the page.
 *
 * @param baseDomain Site hostname without a leading `www.`.
 * @returns Unique lower-cased addresses in discovery order.
 */
function browserExtractEmails(baseDomain: string): string[] {
  const junkRe = /example\.|test@|placeholder|noreply|no-reply|\.(png|jpg|gif|svg|pdf|js|css|woff)$|@sentry\.|@logrocket\.|@segment\.|your@|email@email|\d{10,}@/i;
  const emailRe = /\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/g;
  const found = new Set<string>();

  // Normalises one candidate and records it unless it is empty, not an address, or junk.
  const addCandidate = (raw: unknown): void => {
    if (typeof raw !== 'string') return;
    const e = raw.trim().replace(/^mailto:/i, '').trim().toLowerCase();
    if (e && e.includes('@') && !junkRe.test(e)) found.add(e);
  };

  document.querySelectorAll('a[href^="mailto:"]').forEach((el) => {
    const href = (el as HTMLAnchorElement).href;
    let recipients = href.replace(/^mailto:/i, '').split('?')[0];
    // hrefs may be percent-encoded ("hello%40site.com"); a malformed escape keeps the raw text.
    try { recipients = decodeURIComponent(recipients); } catch { }
    for (const part of recipients.split(',')) addCandidate(part);
  });

  document.querySelectorAll('script[type="application/ld+json"]').forEach((el) => {
    try {
      const data = JSON.parse((el as HTMLElement).textContent ?? '{}');
      const items: any[] = Array.isArray(data) ? data : [data];
      for (const item of items) {
        if (!item || typeof item !== 'object') continue;
        addCandidate(item.email);
        // contactPoint is frequently a single object; iterating that directly
        // would throw and abort the rest of this script.
        const cp = item.contactPoint;
        const points: any[] = Array.isArray(cp) ? cp : cp ? [cp] : [];
        for (const point of points) addCandidate(point?.email);
      }
    } catch { }
  });

  const combined = (document.body?.innerText ?? '') + ' ' + document.documentElement.innerHTML;
  const all = combined.match(emailRe) ?? [];

  // Prefer addresses that belong to the site's own domain family.
  for (const m of all) {
    const e = m.toLowerCase();
    if (junkRe.test(e)) continue;
    const d = e.split('@')[1];
    if (d && (d === baseDomain || d.endsWith('.' + baseDomain) || baseDomain.endsWith('.' + d))) {
      found.add(e);
    }
  }

  // Last resort: accept any non-junk address on the page.
  if (found.size === 0) {
    for (const m of all) {
      const e = m.toLowerCase();
      if (!junkRe.test(e)) found.add(e);
    }
  }

  return [...found];
}
279
280
/**
 * Lists same-origin links found on the current document.
 *
 * This function is handed to `page.evaluate`, so it must stay self-contained.
 *
 * A link is kept when its parsed origin equals `origin` exactly. A plain
 * prefix test would also accept look-alike hosts such as
 * `https://site.com.evil.io` or a different port. Links carrying a `#fragment`
 * are skipped, query strings and a trailing slash are dropped, and the site
 * root itself is never returned.
 *
 * @param origin Site origin, e.g. `https://example.com` (no trailing slash).
 * @returns Unique absolute URLs in document order.
 */
function browserExtractInternalLinks(origin: string): string[] {
  const links = new Set<string>();
  document.querySelectorAll('a[href]').forEach((el) => {
    try {
      const url = new URL((el as HTMLAnchorElement).href);
      if (url.origin !== origin) return;
      if (url.hash) return;

      const clean = url.origin + url.pathname.replace(/\/$/, '');
      if (clean !== origin) links.add(clean);
    } catch {
      // Unparseable href: ignore this link.
    }
  });
  return [...links];
}
297
298
299
/**
 * Looks up known addresses for `domain` through Hunter.io's domain-search API.
 *
 * Only hits with a string `value` and a confidence of at least 50 are kept,
 * sorted by confidence (highest first). `emails[i]` and `confidence[i]` always
 * describe the same hit.
 *
 * @param domain Hostname to search for.
 * @returns Matching addresses and their confidence scores; both arrays are
 *          empty when no API key is configured, the request fails, or nothing
 *          qualifies. Never throws.
 */
async function tryHunterIo(domain: string): Promise<{ emails: string[]; confidence: number[] }> {
  const nothing = (): { emails: string[]; confidence: number[] } => ({ emails: [], confidence: [] });
  if (!hunterApiKey) return nothing();
  try {
    const res = await fetch(
      `https://api.hunter.io/v2/domain-search?domain=${encodeURIComponent(domain)}&limit=10&api_key=${encodeURIComponent(hunterApiKey)}`,
      { signal: AbortSignal.timeout(10000) },
    );
    if (!res.ok) {
      log.debug(` Hunter.io returned HTTP ${res.status} for ${domain}`);
      return nothing();
    }
    const data = await res.json() as any;
    const rawHits: any[] = Array.isArray(data?.data?.emails) ? data.data.emails : [];
    // Normalise each hit first so a missing `value` cannot become the literal
    // string "undefined" and the two output arrays stay index-aligned.
    const hits = rawHits
      .map((hit) => ({
        email: typeof hit?.value === 'string' ? hit.value.toLowerCase().trim() : '',
        confidence: Number(hit?.confidence ?? 0),
      }))
      .filter((hit) => hit.email !== '' && hit.confidence >= 50)
      .sort((a, b) => b.confidence - a.confidence);
    return {
      emails: hits.map((hit) => hit.email),
      confidence: hits.map((hit) => hit.confidence),
    };
  } catch (err) {
    // Log only the error name: the request URL carries the API key.
    log.debug(` Hunter.io lookup failed for ${domain}: ${err instanceof Error ? err.name : 'unknown error'}`);
    return nothing();
  }
}
318
319
320
321
322
// Where a record's emails came from: the Hunter.io API, scraping site pages, or nowhere.
type EmailSource = 'hunter_io' | 'page_scrape' | 'none';
324
325
326
327
328
/**
 * Finds contact emails for one product website.
 *
 * Steps, in order:
 *  1. Resolve `rawWebsiteUrl` to the real site with a redirect-following HEAD
 *     request. Only when that does not produce a non-ProductHunt URL, and
 *     `phProductUrl` is given, the product page is opened to read the site
 *     from `__NEXT_DATA__` or, failing that, to follow the redirect in the
 *     browser.
 *  2. Stop with no emails if the result is still on producthunt.com, is not a
 *     valid URL, or its host is in `SKIP_DOMAINS`.
 *  3. Ask Hunter.io (a no-op without an API key); any hit is returned as-is.
 *  4. Scrape the landing page; any hit is returned as-is.
 *  5. Crawl internal links plus `SEED_PATHS`, highest `urlPriority` first,
 *     until an email is found or `MAX_PAGES_PER_DOMAIN` pages were visited.
 *
 * A dedicated browser context is created for the call and closed on every
 * path that enters the main `try` block.
 *
 * @param rawWebsiteUrl Website URL as reported by the API (may be a PH redirect link).
 * @param browser       Running Playwright browser.
 * @param phProductUrl  ProductHunt page of the product, used as a fallback for step 1.
 * @returns The emails found, the resolved website URL, and where the emails came from.
 */
async function findEmails(
  rawWebsiteUrl: string,
  browser: Browser,
  phProductUrl?: string,
): Promise<{ emails: string[]; resolvedUrl: string; emailSource: EmailSource }> {
  // Reject anything that is not an absolute URL before creating a context.
  try { new URL(rawWebsiteUrl); } catch { return { emails: [], resolvedUrl: rawWebsiteUrl, emailSource: 'none' }; }

  const context = await browser.newContext({
    userAgent: UA_POOL[Math.floor(Math.random() * UA_POOL.length)],
    locale: 'en-US',
    viewport: { width: 1280, height: 800 },
    extraHTTPHeaders: { 'Accept-Language': 'en-US,en;q=0.9' },
    serviceWorkers: 'block',
  });

  // Hide the most common headless-automation fingerprints.
  await context.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
    (window as any).chrome = { runtime: {} };
  });

  // Emails live in markup and scripts; skip heavy assets to speed up loads.
  await context.route('**/*', (route) => {
    const rt = route.request().resourceType();
    if (['image', 'media', 'font', 'stylesheet'].includes(rt)) return route.abort();
    return route.continue();
  });

  const page = await context.newPage();
  let emails: string[] = [];
  let resolvedUrl = rawWebsiteUrl;
  let origin = '';
  let baseDomain = '';
  let pagesVisited = 0;

  try {
    let realPageUrl = rawWebsiteUrl;
    const cleanRedirectUrl = rawWebsiteUrl.split('?')[0];
    // True once the HEAD request produced a non-ProductHunt destination, even
    // when that destination equals the input (a direct website link).
    let resolvedByFetch = false;

    try {
      const headRes = await fetch(cleanRedirectUrl, {
        method: 'HEAD', redirect: 'follow',
        headers: {
          'User-Agent': UA_POOL[0],
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.9',
        },
        signal: AbortSignal.timeout(8000),
      });
      const fetchResolved = headRes.url.split('?')[0].replace(/\/$/, '');
      if (fetchResolved && !fetchResolved.includes('producthunt.com')) {
        realPageUrl = fetchResolved;
        resolvedByFetch = true;
        log.info(` 🔗 ${cleanRedirectUrl} → ${realPageUrl} (fetch)`);
      }
    } catch { }

    // Browser-based fallback, needed only when the plain fetch did not resolve.
    // Comparing URLs instead would also trigger for direct links that resolve
    // to themselves and cost a pointless ProductHunt page load per product.
    if (!resolvedByFetch && phProductUrl) {
      try {
        await page.goto(phProductUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
        await page.waitForTimeout(600);

        // First choice: the website URL embedded in the page's __NEXT_DATA__ JSON.
        const nextDataUrl = await page.evaluate(() => {
          const nd = document.getElementById('__NEXT_DATA__');
          if (!nd) return null;
          try {
            const d = JSON.parse(nd.textContent ?? '');
            const product =
              d?.props?.pageProps?.post ??
              d?.props?.pageProps?.product ??
              d?.props?.pageProps?.productData ??
              d?.props?.pageProps?.data?.post;

            const candidates = [
              product?.website,
              product?.websiteUrl,
              product?.externalUrl,
              product?.originalUrl,
            ];
            for (const c of candidates) {
              if (c && typeof c === 'string' && c.startsWith('http')) return c;
            }
          } catch {}
          return null;
        }) as string | null;

        if (nextDataUrl && !nextDataUrl.includes('producthunt.com')) {
          realPageUrl = nextDataUrl.split('?')[0].replace(/\/$/, '');
          log.info(` 📦 ${cleanRedirectUrl} → ${realPageUrl} (__NEXT_DATA__)`);
        } else {
          // Second choice: follow the redirect link in the browser and take
          // the first off-site Location header, or the final URL.
          const redirectTarget = (nextDataUrl ?? cleanRedirectUrl).split('?')[0];
          let capturedLoc: string | null = null;
          page.on('response', (r) => {
            if (r.status() >= 300 && r.status() < 400) {
              const loc = r.headers()['location'];
              if (loc && !loc.includes('producthunt.com') && !capturedLoc) {
                try {
                  const abs = loc.startsWith('http') ? loc : new URL(loc, redirectTarget).href;
                  capturedLoc = abs.split('?')[0].replace(/\/$/, '');
                } catch {}
              }
            }
          });
          await page.goto(redirectTarget, { waitUntil: 'networkidle', timeout: 20000 });
          const navUrl = page.url().split('?')[0].replace(/\/$/, '');
          const resolved = capturedLoc || (navUrl && !navUrl.includes('producthunt.com') ? navUrl : null);
          if (resolved) {
            realPageUrl = resolved;
            log.info(` 🍪 ${cleanRedirectUrl} → ${realPageUrl} (cookie+redirect)`);
          } else {
            log.info(` ⚠️ Could not resolve ${cleanRedirectUrl} — staying on PH`);
          }
        }
      } catch (e: any) {
        log.debug(` URL resolution via PH page failed: ${e.message?.slice(0,80)}`);
      }
    }

    resolvedUrl = realPageUrl;

    // Nothing to crawl if we never left ProductHunt.
    if (!realPageUrl || realPageUrl.includes('producthunt.com')) {
      return { emails: [], resolvedUrl, emailSource: 'none' };
    }

    let parsedReal: URL;
    try { parsedReal = new URL(realPageUrl); }
    catch { return { emails: [], resolvedUrl, emailSource: 'none' }; }
    origin = parsedReal.origin;
    baseDomain = parsedReal.hostname.replace(/^www\./, '');

    // Check both the full host and its last two labels against the skip list.
    const rootDomain = baseDomain.split('.').slice(-2).join('.');
    if (SKIP_DOMAINS.has(rootDomain) || SKIP_DOMAINS.has(baseDomain)) {
      log.debug(` [skip-domain] ${baseDomain} is in skip list — no email search`);
      return { emails: [], resolvedUrl, emailSource: 'none' };
    }

    const { emails: hunterEmails } = await tryHunterIo(baseDomain);
    if (hunterEmails.length > 0) {
      log.info(` 🎯 Hunter.io → ${hunterEmails.length} email(s) for ${baseDomain}`);
      return { emails: hunterEmails, resolvedUrl, emailSource: 'hunter_io' };
    }

    // The redirect fallback may already have landed the page on the real site.
    const currentPageUrl = page.url().split('?')[0].replace(/\/$/, '');
    const alreadyAtRealSite = currentPageUrl.startsWith(origin);
    if (!alreadyAtRealSite) {
      try {
        await page.goto(realPageUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
        await page.waitForTimeout(1500);
      } catch (e: any) {
        const tag = e.message?.includes('Timeout') ? '[timeout]' : '[network]';
        log.debug(` ${tag} Could not load ${realPageUrl}: ${e.message?.slice(0,80)}`);
        return { emails: [], resolvedUrl, emailSource: 'none' };
      }
    }

    pagesVisited = 1;

    const landingEmails = await page.evaluate(browserExtractEmails, baseDomain);
    if (landingEmails.length > 0) return { emails: landingEmails, resolvedUrl, emailSource: 'page_scrape' };

    type QueueEntry = { url: string; priority: number };
    const visited = new Set<string>([realPageUrl, rawWebsiteUrl]);
    // When the landing page was the site root, the '/' seed would only reload
    // it and waste one of the few page visits.
    if (realPageUrl === origin) visited.add(`${origin}/`);
    const queue: QueueEntry[] = [];

    const landingLinks = await page.evaluate(browserExtractInternalLinks, origin);
    for (const link of landingLinks) {
      if (!visited.has(link)) {
        const path = link.replace(origin, '') || '/';
        queue.push({ url: link, priority: urlPriority(path) });
      }
    }

    // Well-known contact/about paths are tried even if the page does not link to them.
    for (const path of SEED_PATHS) {
      const url = `${origin}${path}`;
      if (!visited.has(url) && !queue.some(q => q.url === url)) {
        queue.push({ url, priority: urlPriority(path) });
      }
    }

    queue.sort((a, b) => b.priority - a.priority);

    while (queue.length > 0 && pagesVisited < MAX_PAGES_PER_DOMAIN) {
      const entry = queue.shift()!;
      if (visited.has(entry.url)) continue;
      visited.add(entry.url);
      pagesVisited++;

      try {
        const response = await page.goto(entry.url, {
          waitUntil: 'domcontentloaded',
          timeout: 14000,
        });

        // Skip non-textual responses (downloads, images, ...).
        const ct = response?.headers()['content-type'] ?? '';
        if (!ct.includes('text/') && !ct.includes('html')) continue;

        // Give client-side rendering a moment before reading the DOM.
        await page.waitForTimeout(1500);

        const found = await page.evaluate(browserExtractEmails, baseDomain);
        if (found.length > 0) {
          emails = found;
          break;
        }

        // No luck here: queue this page's links unless the budget is spent.
        if (pagesVisited < MAX_PAGES_PER_DOMAIN) {
          const links = await page.evaluate(browserExtractInternalLinks, origin);
          for (const link of links) {
            if (!visited.has(link) && !queue.some(q => q.url === link)) {
              const path = link.replace(origin, '') || '/';
              queue.push({ url: link, priority: urlPriority(path) });
            }
          }
          queue.sort((a, b) => b.priority - a.priority);
        }
      } catch { continue; }
    }
  } catch (err: any) {
    const tag = err.message?.includes('Timeout') ? '[timeout]'
      : err.message?.includes('net::ERR') ? '[network]'
      : '[error]';
    log.debug(` ${tag} Navigation failed for ${rawWebsiteUrl}: ${err.message?.slice(0, 100)}`);
  } finally {
    await context.close();
  }

  const emailSource: EmailSource = emails.length > 0 ? 'page_scrape' : 'none';
  log.debug(` crawled ${pagesVisited} page(s) for ${baseDomain || rawWebsiteUrl} → ${emailSource}`);
  return { emails, resolvedUrl, emailSource };
}
583
584
/**
 * Parses a loosely typed value as a base-10 integer.
 * Returns `null` for `null`, `undefined`, the empty string, and anything
 * `parseInt` cannot read a leading integer from.
 */
function toInt(v: unknown): number | null {
  if (v == null || v === '') return null;
  const parsed = Number.parseInt(String(v), 10);
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Parses a loosely typed value as a floating-point number.
 * Returns `null` for `null`, `undefined`, the empty string, and anything
 * `parseFloat` cannot read a leading number from.
 */
function toFloat(v: unknown): number | null {
  if (v == null || v === '') return null;
  const parsed = Number.parseFloat(String(v));
  return Number.isNaN(parsed) ? null : parsed;
}
595
596
/**
 * Converts one post node from the ProductHunt API into a flat dataset record.
 *
 * - `product_name` is the part of the name before a tagline separator: a
 *   colon, an em dash, or a hyphen surrounded by whitespace. A bare hyphen is
 *   not a separator, so names such as "Auto-GPT" stay intact.
 * - `daily_rank` falls back to the positional `rank` when the API gives none.
 * - `launch_date` is the UTC day of `featuredAt`, else of `createdAt`, else
 *   `null`; unparseable dates are treated as missing instead of throwing.
 * - `emails` / `email_source` start empty; the caller fills them in.
 *
 * @param node Post node as selected by `POST_FIELDS`.
 * @param rank 1-based position of the node in this run.
 */
function nodeToRecord(node: any, rank: number): Record<string, unknown> {
  // Name without the trailing tagline; falls back to the whole name when the
  // separator is the very first thing in it.
  const cleanName = (name: unknown): string | null => {
    if (typeof name !== 'string' || !name) return null;
    const head = name.split(/\s-\s|[:—]/)[0].trim();
    return head || name.trim();
  };

  // UTC calendar day of a timestamp, or null when absent or unparseable.
  const toDay = (value: unknown): string | null => {
    if (!value) return null;
    const d = new Date(value as string);
    return Number.isNaN(d.getTime()) ? null : d.toISOString().split('T')[0];
  };

  const makers = (node.makers ?? []).map((m: any) => ({
    maker_name: m.name ?? null,
    maker_id: m.username ?? null,
    maker_ph_id: m.id ?? null,
    twitter_url: m.twitterUsername ? `https://twitter.com/${m.twitterUsername}` : null,
    maker_url: m.url ?? null,
  }));
  const topics = (node.topics?.edges ?? []).map((e: any) => e?.node?.name).filter(Boolean);

  return {
    product_name: cleanName(node.name),
    tagline: node.tagline ?? null,
    description: node.description ?? null,
    upvote_count: toInt(node.votesCount),
    comment_count: toInt(node.commentsCount),
    reviews_count: toInt(node.reviewsCount),
    reviews_rating: toFloat(node.reviewsRating),
    daily_rank: toInt(node.dailyRank) ?? rank,
    weekly_rank: toInt(node.weeklyRank),
    monthly_rank: toInt(node.monthlyRank),
    yearly_rank: toInt(node.yearlyRank),
    launch_date: toDay(node.featuredAt) ?? toDay(node.createdAt),
    product_hunt_url: node.url ?? (node.slug ? `https://www.producthunt.com/posts/${node.slug}` : null),
    website_url: node.website ?? null,
    topics,
    thumbnail_url: node.thumbnail?.url ?? null,
    makers,
    featured: !!node.featuredAt,
    emails: [] as string[],
    email_source: 'none' as string,
    scraped_at: new Date().toISOString(),
  };
}
631
632
633
634
635
// Record keys kept by the `lean` output mode.
const LEAN_FIELDS = new Set([
  'product_name', 'tagline', 'website_url', 'upvote_count', 'daily_rank',
  'emails', 'email_source', 'topics', 'launch_date', 'product_hunt_url',
  'featured', 'scraped_at',
]);

/**
 * Shapes a record for the requested output mode.
 *
 * - `leads`: `null` when the record has no emails, otherwise the record itself.
 * - `lean`:  a new object holding only `LEAN_FIELDS`, with `null` for missing values.
 * - `full`:  the record itself, untouched.
 *
 * @returns The record to push, or `null` when it must be skipped.
 */
function applyOutputMode(
  record: Record<string, unknown>,
  mode: 'full' | 'lean' | 'leads',
): Record<string, unknown> | null {
  switch (mode) {
    case 'leads':
      return (record.emails as string[]).length === 0 ? null : record;
    case 'lean':
      return Object.fromEntries(
        [...LEAN_FIELDS].map((field) => [field, record[field] ?? null]),
      );
    default:
      return record;
  }
}
654
655
// Date window for leaderboard mode, computed once; null for other modes.
const dateRange = buildDateRange();

/**
 * Builds the GraphQL variables for the next page request.
 *
 * @param after     Cursor of the previous page, or `null` for the first page.
 * @param remaining Number of products still wanted; the page size is capped at 20.
 * @returns Variables for `SEARCH_QUERY` in search mode, otherwise for `POSTS_QUERY`.
 */
function buildVars(after: string | null, remaining: number) {
  const first = Math.min(20, remaining);

  if (mode === 'search') {
    return { query: searchQuery, first, after };
  }

  const vars: Record<string, unknown> = {
    first,
    after,
    order: 'VOTES',
    // Featured-only unless the caller asked for every product.
    ...(allProducts ? {} : { featured: true }),
    ...(dateRange
      ? { postedAfter: dateRange.postedAfter, postedBefore: dateRange.postedBefore }
      : {}),
    ...(mode === 'topic' ? { topic } : {}),
  };
  return vars;
}
667
668
// The shared Playwright browser, set once the first launch has completed.
let browser: Browser | null = null;
// In-flight (or completed) launch, shared so concurrent callers never start a
// second browser while the first one is still starting.
let browserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless Chromium instance, launching it on first use.
 *
 * Safe to call concurrently: every caller that arrives during a launch awaits
 * the same promise. Checking only `browser` would let each concurrent caller
 * launch its own Chromium, with all but the last one never closed. A failed
 * launch is not cached, so a later call retries.
 *
 * @throws Whatever `chromium.launch` rejects with.
 */
async function ensureBrowser(): Promise<Browser> {
  if (browser) return browser;

  if (!browserLaunch) {
    browserLaunch = (async () => {
      try {
        const launched = await chromium.launch({
          headless: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
            '--no-zygote',
            '--disable-extensions',
            '--disable-background-networking',
            '--disable-default-apps',
            '--disable-blink-features=AutomationControlled',
          ],
        });
        browser = launched;
        log.info('Playwright browser launched.');
        return launched;
      } catch (err) {
        // Forget the failed attempt so the next call can try again.
        browserLaunch = null;
        throw err;
      }
    })();
  }
  return browserLaunch;
}
691
692
/**
 * Turns one API node into a dataset record, optionally enriches it with
 * emails, and pushes it.
 *
 * - With `enrichEmails` on and an http(s) website, `findEmails` supplies the
 *   emails and the resolved website URL; if it throws, the failure is logged
 *   and only the URL is resolved.
 * - With enrichment off, the website URL is still resolved through redirects.
 * - Records rejected by the output mode are neither pushed nor charged.
 *
 * @param node Post node from the ProductHunt API.
 * @param rank 1-based position of the node in this run.
 */
async function processNode(node: any, rank: number): Promise<void> {
  const record = nodeToRecord(node, rank);

  const { website_url: site, product_hunt_url: phUrl } = record;
  const rawUrl = typeof site === 'string' ? site : '';
  const phProductUrl = typeof phUrl === 'string' ? phUrl.split('?')[0] : undefined;
  const hasHttpUrl = rawUrl.startsWith('http');

  if (hasHttpUrl && enrichEmails) {
    const br = await ensureBrowser();
    try {
      const result = await findEmails(rawUrl, br, phProductUrl);
      record.website_url = result.resolvedUrl;
      record.emails = result.emails;
      record.email_source = result.emailSource;
      if (result.emails.length > 0) {
        log.info(` ✉ ${record.product_name}: ${result.emails.length} email(s) [${result.emailSource}] → ${result.emails.join(', ')}`);
      }
    } catch (err: any) {
      const msg = err.message ?? '';
      // Classify the failure for the log line; the first matching rule wins.
      let tag = '[error]';
      if (msg.includes('Timeout') || msg.includes('timeout')) {
        tag = '[timeout]';
      } else if (msg.includes('net::ERR') || msg.includes('ECONNREFUSED')) {
        tag = '[network]';
      } else if (msg.includes('Navigation')) {
        tag = '[navigation]';
      }
      log.warning(` Email enrichment failed ${tag} for ${record.product_name}: ${msg.slice(0, 100)}`);
      record.website_url = await resolveToRealUrl(rawUrl);
    }
  } else if (hasHttpUrl) {
    record.website_url = await resolveToRealUrl(rawUrl);
  }

  const outputRecord = applyOutputMode(record, outputMode);
  if (outputRecord === null) return;

  await Actor.pushData(outputRecord);
  await Actor.charge({ eventName: 'product-scraped', count: 1 });
}
730
731
732
733
734
735
// Main run: page through the ProductHunt API, process each page in small
// concurrent batches, then shut down.
//
// NOTE(review): `urls` mode is validated at startup, but nothing visible here
// reads `startUrls`; this loop queries the posts API for that mode as well.
// Confirm whether URL handling lives elsewhere or is still missing.
log.info('Fetching products from ProductHunt API (streaming)...', {
  dateRange: dateRange ?? 'no date filter', allProducts, maxResults,
});

const seenIds = new Set<string>();
let cursor: string | null = null;
let pageNum = 1;
let totalCollected = 0;
let totalProcessed = 0;
// Email enrichment drives a real browser, so it is capped at 3 parallel
// products. The result is the step of the batching loop below, so it must be
// a whole number >= 1: a step of 0 would loop forever, and a fractional or
// NaN step would skip or mis-rank products.
const requestedBatch = enrichEmails ? Math.min(maxConcurrency, 3) : maxConcurrency;
const batchSize = Math.max(1, Math.floor(requestedBatch) || 1);

try {
  while (totalCollected < maxResults) {
    log.info(`Page ${pageNum} (collected ${totalCollected}/${maxResults})...`);
    let data: any;
    try {
      data = await phQuery(
        mode === 'search' ? SEARCH_QUERY : POSTS_QUERY,
        buildVars(cursor, maxResults - totalCollected),
      );
    } catch (err: any) {
      // Keep what was pushed so far instead of failing the whole run.
      log.error(`PH API error on page ${pageNum}: ${err.message}`);
      break;
    }

    const edges = data?.posts?.edges ?? [];
    const pageInfo = data?.posts?.pageInfo ?? {};
    if (!edges.length) { log.info('No more results from PH API.'); break; }

    // Drop nodes without an id and ids already seen on an earlier page.
    const pageNodes: any[] = [];
    for (const { node } of edges) {
      if (totalCollected >= maxResults) break;
      if (!node?.id || seenIds.has(node.id)) continue;
      seenIds.add(node.id);
      pageNodes.push(node);
      totalCollected++;
    }
    log.info(`Page ${pageNum}: +${pageNodes.length} products (total: ${totalCollected})`);

    // Process the page batch by batch; rank is the 1-based position in this run.
    for (let i = 0; i < pageNodes.length; i += batchSize) {
      const batch = pageNodes.slice(i, i + batchSize);
      await Promise.all(batch.map((node, j) => processNode(node, totalProcessed + i + j + 1)));
    }
    totalProcessed += pageNodes.length;
    if (totalProcessed % 50 === 0 || !pageInfo.hasNextPage) {
      log.info(`✓ Pushed ${totalProcessed} products to dataset so far`);
    }

    if (!pageInfo.hasNextPage || !pageInfo.endCursor) { log.info('No more pages.'); break; }
    cursor = pageInfo.endCursor;
    pageNum++;
  }
} finally {
  // Close the browser even when processing throws, so Chromium never
  // outlives the run. The cast re-widens the type: `browser` is assigned
  // inside ensureBrowser(), which control-flow analysis cannot see from here.
  const openBrowser = browser as Browser | null;
  if (openBrowser) {
    await openBrowser.close();
    log.info('Playwright browser closed.');
  }
}

log.info(`Done! ${totalProcessed} products scraped and pushed.`);
await Actor.setStatusMessage(`Run complete. If this saved you time, please leave a 5★ review at apify.com/khadinakbar/producthunt-scraper-pro/issues — your feedback keeps this actor maintained and free of bugs.`);
await Actor.exit();