1import { Actor } from 'apify';
2import { log } from 'crawlee';
3import { chromium, type Browser } from 'playwright';
4
5
await Actor.init();

// Raw Actor input. Every option is optional and several accept legacy alias keys,
// which are checked in order after the canonical key.
const rawInput = (await Actor.getInput() ?? {}) as Record<string, unknown>;

// What to scrape: the leaderboard, a search query, a topic, or explicit URLs.
const mode = (rawInput.mode ?? rawInput.scrapeMode ?? 'leaderboard') as
  'leaderboard' | 'search' | 'topic' | 'urls';
const leaderboardPeriod = (rawInput.leaderboardPeriod ?? rawInput.period ?? 'daily') as
  'daily' | 'weekly' | 'monthly' | 'yearly';
const startDate = (rawInput.startDate ?? rawInput.date ?? '') as string;
const endDate = (rawInput.endDate ?? '') as string;
const lookbackDays = Number(rawInput.lookbackDays ?? 1);
const outputMode = (rawInput.outputMode ?? 'full') as 'full' | 'lean' | 'leads';
const searchQuery = (rawInput.searchQuery ?? rawInput.query ?? rawInput.q) as string | undefined;
const topic = (rawInput.topic ?? rawInput.topicSlug) as string | undefined;
const startUrls = (rawInput.startUrls ?? []) as Array<{ url: string } | string>;

// A non-numeric limit would make `totalCollected < maxResults` permanently false (zero
// results) and a fractional one would be sent to the API as a non-integer page size, so
// fall back to the default / round down instead.
const requestedMaxResults = Number(rawInput.maxResults ?? rawInput.maxItems ?? rawInput.limit ?? 1000);
const maxResults = Number.isFinite(requestedMaxResults)
  ? Math.max(0, Math.floor(requestedMaxResults))
  : 1000;

const allProducts = Boolean(rawInput.allProducts ?? rawInput.includeAll ?? false);
const enrichEmails = Boolean(rawInput.enrichEmails ?? rawInput.findEmails ?? true);
const hunterApiKey = (rawInput.hunterApiKey ?? rawInput.hunterKey ?? process.env.HUNTER_API_KEY ?? '') as string;

// Clamp to 1..5. The value is later used as a loop step, so 0, a negative number or NaN
// would either loop forever or silently skip products.
const requestedConcurrency = Number(rawInput.maxConcurrency ?? 3);
const maxConcurrency = Number.isFinite(requestedConcurrency)
  ? Math.min(Math.max(Math.floor(requestedConcurrency), 1), 5)
  : 3;

// NOTE(review): this credential is committed to source. It is kept so existing runs keep
// working, but it should be rotated and supplied via input or PH_API_TOKEN instead.
const DEFAULT_PH_TOKEN = 'siHY_lSwsPoDFCx71pKL0Gj_jOvDIYTcaLKfgvVC8Y4';
const phApiToken = (rawInput.phApiToken ?? rawInput.apiToken ?? process.env.PH_API_TOKEN ?? DEFAULT_PH_TOKEN) as string;
29
30
// Tell the operator when the run is falling back to the bundled API token.
if (!(rawInput.phApiToken || rawInput.apiToken || process.env.PH_API_TOKEN)) {
  log.info('Using built-in PH API token. Supply your own via phApiToken input or PH_API_TOKEN env var.');
}

// Each non-leaderboard mode has exactly one mandatory input; fail the run when it is missing.
switch (mode) {
  case 'search':
    if (!searchQuery) await Actor.fail('searchQuery required for search mode.');
    break;
  case 'topic':
    if (!topic) await Actor.fail('topic required for topic mode.');
    break;
  case 'urls':
    if (startUrls.length === 0) await Actor.fail('startUrls required for urls mode.');
    break;
  default:
    break;
}

// Echo the effective configuration once at startup.
log.info('ProductHunt Scraper starting', {
  mode,
  leaderboardPeriod,
  startDate,
  endDate: endDate || '(same as startDate)',
  lookbackDays,
  maxResults,
  allProducts,
  enrichEmails,
  outputMode,
  emailMethod: hunterApiKey ? 'Hunter.io + Playwright fallback' : 'Playwright browser scraping',
});
43
44
/** Formats a Date as its UTC calendar day (`YYYY-MM-DD`). */
function isoDate(d: Date): string {
  const iso = d.toISOString();
  return iso.slice(0, iso.indexOf('T'));
}

/**
 * Sunday-to-Saturday week containing `d`, as UTC calendar days.
 *
 * All arithmetic uses the UTC accessors because `isoDate` formats in UTC; mixing in
 * local-time getters shifts the bounds by a day on hosts whose zone is not UTC.
 * The input date is not mutated.
 */
function getWeekBounds(d: Date): { start: string; end: string } {
  const weekStart = new Date(d);
  weekStart.setUTCDate(d.getUTCDate() - d.getUTCDay());
  const weekEnd = new Date(weekStart);
  weekEnd.setUTCDate(weekStart.getUTCDate() + 6);
  return { start: isoDate(weekStart), end: isoDate(weekEnd) };
}

/** First and last UTC calendar day of the month containing `d`. */
function getMonthBounds(d: Date): { start: string; end: string } {
  const year = d.getUTCFullYear();
  const month = d.getUTCMonth();
  // Day 0 of the following month is the last day of this one.
  return {
    start: isoDate(new Date(Date.UTC(year, month, 1))),
    end: isoDate(new Date(Date.UTC(year, month + 1, 0))),
  };
}

/** January 1st and December 31st of the UTC year containing `d`. */
function getYearBounds(d: Date): { start: string; end: string } {
  const year = d.getUTCFullYear();
  return { start: `${year}-01-01`, end: `${year}-12-31` };
}
59
/**
 * Builds the `postedAfter` / `postedBefore` window sent to the PH API.
 *
 * Only leaderboard mode is date-filtered; every other mode returns `null`.
 * A `lookbackDays` greater than 1 takes precedence over `startDate` / `leaderboardPeriod`.
 * `endDate` is honoured only for the daily period.
 *
 * @returns ISO timestamps with an explicit +00:00 offset, or `null` when no filter applies.
 * @throws RangeError when a weekly/monthly/yearly window is requested with an unparsable `startDate`.
 */
function buildDateRange(): { postedAfter: string; postedBefore: string } | null {
  if (mode !== 'leaderboard') return null;

  const toRange = (from: string, to: string) => ({
    postedAfter: `${from}T00:00:00+00:00`,
    postedBefore: `${to}T23:59:59+00:00`,
  });

  if (lookbackDays > 1) {
    const end = new Date();
    const begin = new Date(end);
    // UTC arithmetic, because isoDate() formats the UTC calendar day.
    begin.setUTCDate(end.getUTCDate() - lookbackDays);
    return toRange(isoDate(begin), isoDate(end));
  }

  if (leaderboardPeriod === 'daily') {
    // The supplied startDate is passed through verbatim; today is used when it is empty.
    const from = startDate || isoDate(new Date());
    return toRange(from, endDate || from);
  }

  let boundsOf: (d: Date) => { start: string; end: string };
  switch (leaderboardPeriod) {
    case 'weekly': boundsOf = getWeekBounds; break;
    case 'monthly': boundsOf = getMonthBounds; break;
    case 'yearly': boundsOf = getYearBounds; break;
    default: return null;
  }

  // Anchor at noon UTC so the calendar day cannot roll over.
  const base = startDate ? new Date(`${startDate}T12:00:00Z`) : new Date();
  if (Number.isNaN(base.getTime())) {
    // Previously this surfaced as an opaque "Invalid time value" from toISOString().
    throw new RangeError(`Invalid startDate "${startDate}" — expected YYYY-MM-DD.`);
  }

  const { start, end } = boundsOf(base);
  return toRange(start, end);
}
90
91
const PH_API = 'https://api.producthunt.com/v2/api/graphql';

/** Upper bound for one PH API round trip, so a stalled connection cannot hang the run. */
const PH_REQUEST_TIMEOUT_MS = 60_000;

/**
 * Executes a GraphQL request against the ProductHunt v2 API.
 *
 * HTTP 429 responses are retried (up to five attempts in total), waiting either the
 * server-provided `reset_in` (+5s, capped at 120s) or a linear 30s × attempt backoff.
 *
 * @param query     GraphQL document.
 * @param variables GraphQL variables.
 * @param attempt   Zero-based attempt counter to start from; callers normally omit it.
 * @returns The `data` member of the GraphQL response.
 * @throws Error on a non-2xx status, on GraphQL-level errors, or once the 429 retries are
 *         exhausted. Network failures and timeouts from `fetch` propagate unchanged.
 */
async function phQuery(query: string, variables: Record<string, unknown> = {}, attempt = 0): Promise<any> {
  for (let tries = attempt; ; tries++) {
    const res = await fetch(PH_API, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Accept': 'application/json', 'Authorization': `Bearer ${phApiToken}` },
      body: JSON.stringify({ query, variables }),
      signal: AbortSignal.timeout(PH_REQUEST_TIMEOUT_MS),
    });

    if (res.status === 429) {
      const body = await res.text().catch(() => '{}');
      let waitSecs = 30 * (tries + 1);
      try {
        const resetIn = JSON.parse(body)?.errors?.[0]?.details?.reset_in;
        if (resetIn && typeof resetIn === 'number') waitSecs = Math.max(0, Math.min(resetIn + 5, 120));
      } catch {
        // Body was not JSON — keep the linear backoff computed above.
      }
      if (tries >= 4) throw new Error(`PH API 429 after ${tries + 1} retries: ${body.slice(0, 200)}`);
      log.warning(`PH API rate limited — waiting ${waitSecs}s before retry (attempt ${tries + 1}/5)...`);
      await new Promise((resolve) => setTimeout(resolve, waitSecs * 1000));
      continue;
    }

    if (!res.ok) {
      const body = await res.text().catch(() => '');
      throw new Error(`PH API ${res.status}: ${body.slice(0, 200)}`);
    }

    const json = await res.json() as { data?: any; errors?: any[] };
    if (json.errors?.length) throw new Error(`GraphQL: ${JSON.stringify(json.errors[0])}`);
    return json.data;
  }
}
118
/** Selection set requested for every post; interpolated into both queries below. */
const POST_FIELDS = `
 id name tagline description votesCount commentsCount url website slug
 featuredAt createdAt dailyRank weeklyRank monthlyRank yearlyRank
 reviewsCount reviewsRating
 thumbnail { url }
 topics(first: 10) { edges { node { name } } }
 makers { id name username twitterUsername url }
`;

/**
 * Paginated `posts` query used for every mode except search.
 * All filters (order, featured flag, date window, topic) are passed as variables.
 */
const POSTS_QUERY = `query GetPosts($first:Int $after:String $order:PostsOrder $featured:Boolean $postedAfter:DateTime $postedBefore:DateTime $topic:String) {
 posts(first:$first after:$after order:$order featured:$featured postedAfter:$postedAfter postedBefore:$postedBefore topic:$topic) {
 edges { node { ${POST_FIELDS} } }
 pageInfo { endCursor hasNextPage }
 }
}`;

/**
 * Paginated query used in search mode; results are always ordered by votes.
 * NOTE(review): relies on `posts` accepting a `search` argument — confirm against the current PH schema.
 */
const SEARCH_QUERY = `query SearchPosts($query:String! $first:Int $after:String) {
 posts(first:$first after:$after order:VOTES search:$query) {
 edges { node { ${POST_FIELDS} } }
 pageInfo { endCursor hasNextPage }
 }
}`;
140
141
142
143
/**
 * Follows HTTP redirects from a (typically ProductHunt-hosted) link to find the
 * product's real website, without using a browser.
 *
 * The query string is dropped before requesting. A HEAD request is tried first; if it
 * still ends on producthunt.com, a GET is tried as well.
 *
 * @param rawUrl URL as returned by the API.
 * @returns The final URL without query string or trailing slash, or `rawUrl` unchanged
 *          when it is empty, the request fails, or the result still points at producthunt.com.
 */
async function resolveToRealUrl(rawUrl: string): Promise<string> {
  if (!rawUrl) return rawUrl;

  try {
    const target = rawUrl.split('?')[0];
    const crawlerHeaders = { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' };
    const follow = (method: 'HEAD' | 'GET', timeoutMs: number) =>
      fetch(target, {
        method,
        redirect: 'follow',
        headers: crawlerHeaders,
        signal: AbortSignal.timeout(timeoutMs),
      });

    let response = await follow('HEAD', 8000);
    if (response.url.includes('producthunt.com')) {
      response = await follow('GET', 10000);
    }

    const finalUrl = response.url.split('?')[0].replace(/\/$/, '');
    if (!finalUrl || finalUrl.includes('producthunt.com')) return rawUrl;
    return finalUrl;
  } catch {
    return rawUrl;
  }
}
165
166
167
/** Hard cap on pages loaded per product site while looking for emails (landing page included). */
const MAX_PAGES_PER_DOMAIN = 8;

/**
 * Hosts for which no email search is attempted. findEmails checks both the full
 * hostname (without `www.`) and its last two labels against this set.
 */
const SKIP_DOMAINS = new Set([
  'apps.apple.com', 'play.google.com', 'github.com', 'youtube.com',
  'twitter.com', 'x.com', 'linkedin.com', 'facebook.com', 'instagram.com',
  'reddit.com', 'discord.com', 'discord.gg', 'slack.com',
  'medium.com', 'substack.com', 'producthunt.com', 'amazon.com',
  'chrome.google.com', 'marketplace.visualstudio.com',
  'notion.so', 'figma.com', 'trello.com', 'airtable.com',
]);

/** Desktop user-agent strings; one is picked at random for each browser context. */
const UA_POOL = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3.1 Safari/605.1.15',
];

/**
 * Paths queued on every site in addition to the links discovered on its pages.
 * Visit order is decided by urlPriority(), not by the order listed here.
 */
const SEED_PATHS = [
  '/contact',
  '/contact-us',
  '/about',
  '/about-us',
  '/team',
  '/support',
  '/help',
  '/privacy',
  '/privacy-policy',
  '/',
];
199
200
/**
 * Scores a URL or path by how likely the page is to expose a contact email.
 *
 * Matching is case-insensitive and substring-based; the first keyword in the table
 * that occurs wins. The site root and `/index…` pages score lowest (5); anything
 * unrecognised scores 20.
 */
function urlPriority(url: string): number {
  const lowered = url.toLowerCase();

  // Ordered from most to least promising — order matters when several keywords match.
  const keywordScores: ReadonlyArray<readonly [string, number]> = [
    ['contact', 100],
    ['about', 90],
    ['team', 80],
    ['support', 75],
    ['help', 70],
    ['company', 65],
    ['privacy', 60],
    ['legal', 55],
    ['imprint', 50],
    ['faq', 45],
    ['humans.txt', 40],
  ];

  const match = keywordScores.find(([keyword]) => lowered.includes(keyword));
  if (match) return match[1];

  return /^\/$|\/index/.test(lowered) ? 5 : 20;
}
217
218
/**
 * Collects email addresses from the current document.
 *
 * Runs inside the page via `page.evaluate`, so it must stay self-contained: it may not
 * reference anything outside its own body.
 *
 * Sources, in order:
 *  1. `mailto:` links (percent-decoded; comma-separated recipients are split),
 *  2. JSON-LD `email` / `contactPoint.email` values, including nodes nested under `@graph`
 *     and `contactPoint` given as a single object instead of an array,
 *  3. addresses in the visible text and raw HTML whose domain matches `baseDomain`,
 *  4. if nothing was found at all, any non-junk address from the text/HTML.
 *
 * @param baseDomain Hostname of the site being crawled, without a leading `www.`.
 * @returns Unique lower-cased addresses in discovery order.
 */
function browserExtractEmails(baseDomain: string): string[] {
  const junkRe = /example\.|test@|placeholder|noreply|no-reply|\.(png|jpg|gif|svg|pdf|js|css|woff)$|@sentry\.|@logrocket\.|@segment\.|your@|email@email|\d{10,}@/i;
  const emailRe = /\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b/g;
  const found = new Set<string>();

  // Normalises one candidate and records it unless it is empty, malformed or junk.
  const keep = (candidate: unknown): void => {
    if (typeof candidate !== 'string') return;
    // JSON-LD values are frequently written as "mailto:someone@site.tld".
    const email = candidate.trim().replace(/^mailto:/i, '').trim().toLowerCase();
    if (email && email.includes('@') && !junkRe.test(email)) found.add(email);
  };

  // 1. mailto: links.
  document.querySelectorAll('a[href^="mailto:"]').forEach((el) => {
    const href = (el as HTMLAnchorElement).href;
    let recipients = href.replace(/^mailto:/i, '').split('?')[0] ?? '';
    try {
      recipients = decodeURIComponent(recipients);
    } catch {
      // Malformed percent-encoding — fall back to the raw text.
    }
    for (const recipient of recipients.split(',')) keep(recipient);
  });

  // 2. JSON-LD structured data.
  const visitJsonLd = (node: unknown): void => {
    if (node === null || typeof node !== 'object') return;
    const item = node as Record<string, unknown>;
    keep(item.email);

    // contactPoint may be one object or an array of them.
    const contactPoint = item.contactPoint;
    const points = Array.isArray(contactPoint) ? contactPoint : [contactPoint];
    for (const point of points) {
      if (point !== null && typeof point === 'object') {
        keep((point as Record<string, unknown>).email);
      }
    }

    // Many generators wrap every entity in a top-level @graph array.
    const graph = item['@graph'];
    if (Array.isArray(graph)) graph.forEach(visitJsonLd);
  };

  document.querySelectorAll('script[type="application/ld+json"]').forEach((el) => {
    try {
      const data: unknown = JSON.parse((el as HTMLElement).textContent ?? '{}');
      const items = Array.isArray(data) ? data : [data];
      items.forEach(visitJsonLd);
    } catch {
      // Invalid JSON-LD is common in the wild; skip the block.
    }
  });

  // 3. Free-text scan, preferring addresses on the site's own domain.
  const combined = (document.body?.innerText ?? '') + ' ' + document.documentElement.innerHTML;
  const all = combined.match(emailRe) ?? [];

  for (const match of all) {
    const email = match.toLowerCase();
    if (junkRe.test(email)) continue;
    const domain = email.split('@')[1];
    if (domain && (domain === baseDomain || domain.endsWith('.' + baseDomain) || baseDomain.endsWith('.' + domain))) {
      found.add(email);
    }
  }

  // 4. Last resort: accept off-domain addresses rather than returning nothing.
  if (found.size === 0) {
    for (const match of all) {
      const email = match.toLowerCase();
      if (!junkRe.test(email)) found.add(email);
    }
  }

  return [...found];
}
274
275
/**
 * Lists same-origin links found on the current document.
 *
 * Runs inside the page via `page.evaluate`, so it must stay self-contained.
 *
 * A link is kept only when its parsed origin equals `origin` exactly. A plain string
 * prefix test would also accept look-alike hosts such as `https://acme.io.evil.com`
 * or `https://acme.io@evil.com`. Links carrying a fragment are skipped, query strings
 * and a trailing slash are dropped, and the site root itself is excluded.
 *
 * @param origin Origin of the site being crawled, e.g. `https://acme.io`.
 * @returns Unique normalised URLs in document order.
 */
function browserExtractInternalLinks(origin: string): string[] {
  const links = new Set<string>();

  document.querySelectorAll('a[href]').forEach((el) => {
    const anchor = el as HTMLAnchorElement;
    try {
      const url = new URL(anchor.href);
      if (url.origin !== origin) return;
      if (url.hash) return;

      const clean = url.origin + url.pathname.replace(/\/$/, '');
      if (clean !== origin) links.add(clean);
    } catch {
      // Unparsable href — ignore it.
    }
  });

  return [...links];
}
292
293
294
/**
 * Queries Hunter.io's domain search for addresses on `domain`.
 *
 * Skipped when no API key is configured. Only hits with a confidence of at least 50
 * are kept, highest confidence first. Any failure (network, timeout, non-2xx status,
 * unexpected payload) yields an empty result instead of throwing.
 *
 * @param domain Bare hostname to look up.
 * @returns Lower-cased addresses and the confidence values of the retained hits.
 */
async function tryHunterIo(domain: string): Promise<{ emails: string[]; confidence: number[] }> {
  const nothing = (): { emails: string[]; confidence: number[] } => ({ emails: [], confidence: [] });

  if (!hunterApiKey) return nothing();

  try {
    const endpoint = `https://api.hunter.io/v2/domain-search?domain=${encodeURIComponent(domain)}&limit=10&api_key=${encodeURIComponent(hunterApiKey)}`;
    const res = await fetch(endpoint, { signal: AbortSignal.timeout(10000) });
    if (!res.ok) return nothing();

    const payload = await res.json() as any;
    const ranked: any[] = (payload?.data?.emails ?? []).filter(
      (hit: any) => (hit.confidence ?? 0) >= 50,
    );
    ranked.sort((left: any, right: any) => (right.confidence ?? 0) - (left.confidence ?? 0));

    const emails: string[] = [];
    const confidence: number[] = [];
    for (const hit of ranked) {
      const address = String(hit.value).toLowerCase().trim();
      if (address) emails.push(address);
      confidence.push(hit.confidence ?? 0);
    }
    return { emails, confidence };
  } catch {
    return nothing();
  }
}
313
314
315
316
317
/**
 * Origin of the emails attached to a record: the Hunter.io lookup, extraction from the
 * product's own pages, or `'none'` when nothing was found or the search was skipped.
 */
type EmailSource = 'hunter_io' | 'page_scrape' | 'none';
319
320
321
322
323
/**
 * Resolves a product's website link to its real destination and looks for contact emails.
 *
 * Steps:
 *  1. Resolve `rawWebsiteUrl` with a plain HEAD request that follows redirects.
 *  2. If that did not leave producthunt.com and `phProductUrl` is given, open the PH product
 *     page in the browser, read the website from `__NEXT_DATA__`, or else navigate to the
 *     redirect link and capture where it leads.
 *  3. Give up (no emails) if the URL is still on producthunt.com, unparsable, or in SKIP_DOMAINS.
 *  4. Ask Hunter.io; return its hits if there are any.
 *  5. Otherwise scrape the landing page, then crawl same-origin pages in priority order
 *     until an email is found or MAX_PAGES_PER_DOMAIN pages have been visited.
 *
 * @param rawWebsiteUrl Website URL as reported by the API.
 * @param browser       Shared Playwright browser; a dedicated context is created and closed per call.
 * @param phProductUrl  Optional ProductHunt product page, used as a fallback for URL resolution.
 * @returns The emails found, the resolved website URL, and which source produced the emails.
 */
async function findEmails(
  rawWebsiteUrl: string,
  browser: Browser,
  phProductUrl?: string,
): Promise<{ emails: string[]; resolvedUrl: string; emailSource: EmailSource }> {
  // Reject unparsable input before any browser resources are created.
  try { new URL(rawWebsiteUrl); } catch { return { emails: [], resolvedUrl: rawWebsiteUrl, emailSource: 'none' }; }

  // One isolated context per product, with a randomly chosen user agent.
  // NOTE(review): the context is set up outside the try/finally below, so if
  // addInitScript/route/newPage were to throw, the context would not be closed.
  const context = await browser.newContext({
    userAgent: UA_POOL[Math.floor(Math.random() * UA_POOL.length)],
    locale: 'en-US',
    viewport: { width: 1280, height: 800 },
    extraHTTPHeaders: { 'Accept-Language': 'en-US,en;q=0.9' },
    serviceWorkers: 'block',
  });

  // Override a few properties that automation detectors commonly probe.
  await context.addInitScript(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
    (window as any).chrome = { runtime: {} };
  });

  // Skip images, media, fonts and stylesheets — only the markup is needed.
  await context.route('**/*', (route) => {
    const rt = route.request().resourceType();
    if (['image', 'media', 'font', 'stylesheet'].includes(rt)) return route.abort();
    return route.continue();
  });

  const page = await context.newPage();
  let emails: string[] = [];
  let resolvedUrl = rawWebsiteUrl;
  let origin = '';
  let baseDomain = '';
  let pagesVisited = 0;

  try {
    // ── Step 1: resolve the link with a plain HEAD request ────────────────────────
    let realPageUrl = rawWebsiteUrl;
    const cleanRedirectUrl = rawWebsiteUrl.split('?')[0];

    try {
      const headRes = await fetch(cleanRedirectUrl, {
        method: 'HEAD', redirect: 'follow',
        headers: {
          'User-Agent': UA_POOL[0],
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.9',
        },
        signal: AbortSignal.timeout(8000),
      });
      const fetchResolved = headRes.url.split('?')[0].replace(/\/$/, '');
      if (fetchResolved && !fetchResolved.includes('producthunt.com')) {
        realPageUrl = fetchResolved;
        log.info(` 🔗 ${cleanRedirectUrl} → ${realPageUrl} (fetch)`);
      }
    } catch {
      // Best effort: on any fetch failure fall through to the browser-based resolution.
    }

    // ── Step 2: browser fallback via the ProductHunt product page ─────────────────
    if ((realPageUrl === rawWebsiteUrl || realPageUrl.includes('producthunt.com')) && phProductUrl) {
      try {
        await page.goto(phProductUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
        await page.waitForTimeout(600);

        // Look for the product's website in the page's embedded __NEXT_DATA__ JSON.
        const nextDataUrl = await page.evaluate(() => {
          const nd = document.getElementById('__NEXT_DATA__');
          if (!nd) return null;
          try {
            const d = JSON.parse(nd.textContent ?? '');
            const product =
              d?.props?.pageProps?.post ??
              d?.props?.pageProps?.product ??
              d?.props?.pageProps?.productData ??
              d?.props?.pageProps?.data?.post;

            const candidates = [
              product?.website,
              product?.websiteUrl,
              product?.externalUrl,
              product?.originalUrl,
            ];
            for (const c of candidates) {
              if (c && typeof c === 'string' && c.startsWith('http')) return c;
            }
          } catch {}
          return null;
        }) as string | null;

        if (nextDataUrl && !nextDataUrl.includes('producthunt.com')) {
          realPageUrl = nextDataUrl.split('?')[0].replace(/\/$/, '');
          log.info(` 📦 ${cleanRedirectUrl} → ${realPageUrl} (__NEXT_DATA__)`);
        } else {
          // Navigate to the redirect link in the same page (after the PH page was loaded)
          // and remember the first Location header that points away from producthunt.com.
          const redirectTarget = (nextDataUrl ?? cleanRedirectUrl).split('?')[0];
          let capturedLoc: string | null = null;
          page.on('response', (r) => {
            if (r.status() >= 300 && r.status() < 400) {
              const loc = r.headers()['location'];
              if (loc && !loc.includes('producthunt.com') && !capturedLoc) {
                try {
                  const abs = loc.startsWith('http') ? loc : new URL(loc, redirectTarget).href;
                  capturedLoc = abs.split('?')[0].replace(/\/$/, '');
                } catch {}
              }
            }
          });
          await page.goto(redirectTarget, { waitUntil: 'networkidle', timeout: 20000 });
          const navUrl = page.url().split('?')[0].replace(/\/$/, '');
          // Prefer the captured redirect target; otherwise use wherever the page ended up.
          const resolved = capturedLoc || (navUrl && !navUrl.includes('producthunt.com') ? navUrl : null);
          if (resolved) {
            realPageUrl = resolved;
            log.info(` 🍪 ${cleanRedirectUrl} → ${realPageUrl} (cookie+redirect)`);
          } else {
            log.info(` ⚠️ Could not resolve ${cleanRedirectUrl} — staying on PH`);
          }
        }
      } catch (e: any) {
        log.debug(` URL resolution via PH page failed: ${e.message?.slice(0,80)}`);
      }
    }

    resolvedUrl = realPageUrl;

    // ── Step 3: bail out when there is no external site to inspect ────────────────
    if (!realPageUrl || realPageUrl.includes('producthunt.com')) {
      return { emails: [], resolvedUrl, emailSource: 'none' };
    }

    let parsedReal: URL;
    try { parsedReal = new URL(realPageUrl); }
    catch { return { emails: [], resolvedUrl, emailSource: 'none' }; }
    origin = parsedReal.origin;
    baseDomain = parsedReal.hostname.replace(/^www\./, '');

    // Check both the full hostname and its last two labels against the skip list.
    const rootDomain = baseDomain.split('.').slice(-2).join('.');
    if (SKIP_DOMAINS.has(rootDomain) || SKIP_DOMAINS.has(baseDomain)) {
      log.debug(` [skip-domain] ${baseDomain} is in skip list — no email search`);
      return { emails: [], resolvedUrl, emailSource: 'none' };
    }

    // ── Step 4: Hunter.io (returns nothing when no API key is configured) ─────────
    const { emails: hunterEmails } = await tryHunterIo(baseDomain);
    if (hunterEmails.length > 0) {
      log.info(` 🎯 Hunter.io → ${hunterEmails.length} email(s) for ${baseDomain}`);
      return { emails: hunterEmails, resolvedUrl, emailSource: 'hunter_io' };
    }

    // ── Step 5: scrape the landing page ───────────────────────────────────────────
    // Step 2 may already have left the page on the real site; only navigate if it did not.
    const currentPageUrl = page.url().split('?')[0].replace(/\/$/, '');
    const alreadyAtRealSite = currentPageUrl.startsWith(origin);
    if (!alreadyAtRealSite) {
      try {
        await page.goto(realPageUrl, { waitUntil: 'domcontentloaded', timeout: 15000 });
        await page.waitForTimeout(1500);
      } catch (e: any) {
        const tag = e.message?.includes('Timeout') ? '[timeout]' : '[network]';
        log.debug(` ${tag} Could not load ${realPageUrl}: ${e.message?.slice(0,80)}`);
        return { emails: [], resolvedUrl, emailSource: 'none' };
      }
    }

    pagesVisited = 1;

    const landingEmails = await page.evaluate(browserExtractEmails, baseDomain);
    if (landingEmails.length > 0) return { emails: landingEmails, resolvedUrl, emailSource: 'page_scrape' };

    // ── Step 6: priority-ordered crawl of same-origin pages ───────────────────────
    type QueueEntry = { url: string; priority: number };
    const visited = new Set<string>([realPageUrl, rawWebsiteUrl]);
    const queue: QueueEntry[] = [];

    // Seed the queue with links found on the landing page…
    const landingLinks = await page.evaluate(browserExtractInternalLinks, origin);
    for (const link of landingLinks) {
      if (!visited.has(link)) {
        const path = link.replace(origin, '') || '/';
        queue.push({ url: link, priority: urlPriority(path) });
      }
    }

    // …plus the well-known contact/about paths, whether or not they are linked.
    for (const path of SEED_PATHS) {
      const url = `${origin}${path}`;
      if (!visited.has(url) && !queue.some(q => q.url === url)) {
        queue.push({ url, priority: urlPriority(path) });
      }
    }

    // Highest priority first.
    queue.sort((a, b) => b.priority - a.priority);

    while (queue.length > 0 && pagesVisited < MAX_PAGES_PER_DOMAIN) {
      const entry = queue.shift()!;
      if (visited.has(entry.url)) continue;
      visited.add(entry.url);
      // Counted before loading, so failed and non-HTML pages also use up the page budget.
      pagesVisited++;

      try {
        const response = await page.goto(entry.url, {
          waitUntil: 'domcontentloaded',
          timeout: 14000,
        });

        // Ignore responses that are not text/HTML.
        const ct = response?.headers()['content-type'] ?? '';
        if (!ct.includes('text/') && !ct.includes('html')) continue;

        // Fixed pause before extracting — presumably to let client-side rendering finish.
        await page.waitForTimeout(1500);

        const found = await page.evaluate(browserExtractEmails, baseDomain);
        if (found.length > 0) {
          // Stop at the first page that yields any email.
          emails = found;
          break;
        }

        // Add newly discovered links and restore priority order.
        if (pagesVisited < MAX_PAGES_PER_DOMAIN) {
          const links = await page.evaluate(browserExtractInternalLinks, origin);
          for (const link of links) {
            if (!visited.has(link) && !queue.some(q => q.url === link)) {
              const path = link.replace(origin, '') || '/';
              queue.push({ url: link, priority: urlPriority(path) });
            }
          }
          queue.sort((a, b) => b.priority - a.priority);
        }
      } catch { continue; }
    }
  } catch (err: any) {
    // Errors not handled by the inner try blocks end the search for this product;
    // whatever was found so far is returned below.
    const tag = err.message?.includes('Timeout') ? '[timeout]'
      : err.message?.includes('net::ERR') ? '[network]'
      : '[error]';
    log.debug(` ${tag} Navigation failed for ${rawWebsiteUrl}: ${err.message?.slice(0, 100)}`);
  } finally {
    // Runs for the early returns above as well, so the context is closed on every path.
    await context.close();
  }

  const emailSource: EmailSource = emails.length > 0 ? 'page_scrape' : 'none';
  log.debug(` crawled ${pagesVisited} page(s) for ${baseDomain || rawWebsiteUrl} → ${emailSource}`);
  return { emails, resolvedUrl, emailSource };
}
578
579
/**
 * Lenient integer coercion: `null`, `undefined`, `''` and unparsable values become `null`.
 * The value is stringified and read with base-10 `parseInt`, so trailing junk and
 * fractional parts are ignored.
 */
function toInt(v: unknown): number | null {
  if (v == null || v === '') return null;
  const parsed = Number.parseInt(String(v), 10);
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Lenient float coercion: `null`, `undefined`, `''` and unparsable values become `null`.
 * The value is stringified and read with `parseFloat`, so trailing junk is ignored.
 */
function toFloat(v: unknown): number | null {
  if (v == null || v === '') return null;
  const parsed = Number.parseFloat(String(v));
  return Number.isNaN(parsed) ? null : parsed;
}
590
591
/**
 * Flattens a GraphQL post node into the dataset record shape.
 *
 * @param node Post node as selected by POST_FIELDS.
 * @param rank Fallback for `daily_rank` when the API does not supply one.
 * @returns The record with `emails` / `email_source` initialised to "nothing found";
 *          the caller fills them in after enrichment.
 */
function nodeToRecord(node: any, rank: number): Record<string, unknown> {
  // Strips a tagline suffix ("Name - tagline", "Name: tagline", "Name — tagline") while
  // leaving in-word punctuation alone. Splitting on every hyphen or colon used to turn
  // names like "Chat-GPT" into "Chat".
  const cleanName = (raw: unknown): string | null => {
    if (typeof raw !== 'string') return null;
    const trimmed = raw.trim();
    if (!trimmed) return null;
    const head = (trimmed.split(/\s+[-–]\s+|\s*—\s*|:\s+/)[0] ?? '').trim();
    return head || trimmed;
  };

  // Calendar day (UTC) of a timestamp, or null when it is missing or unparsable, so that
  // one malformed value cannot throw and abort the whole batch.
  const toDay = (raw: unknown): string | null => {
    if (!raw) return null;
    const parsed = new Date(raw as string);
    if (Number.isNaN(parsed.getTime())) return null;
    const iso = parsed.toISOString();
    return iso.slice(0, iso.indexOf('T'));
  };

  const makers = (node.makers ?? []).map((m: any) => ({
    maker_name: m.name ?? null,
    maker_id: m.username ?? null,
    maker_ph_id: m.id ?? null,
    twitter_url: m.twitterUsername ? `https://twitter.com/${m.twitterUsername}` : null,
    maker_url: m.url ?? null,
  }));
  const topics = (node.topics?.edges ?? []).map((e: any) => e?.node?.name).filter(Boolean);

  return {
    product_name: cleanName(node.name),
    tagline: node.tagline ?? null,
    description: node.description ?? null,
    upvote_count: toInt(node.votesCount),
    comment_count: toInt(node.commentsCount),
    reviews_count: toInt(node.reviewsCount),
    reviews_rating: toFloat(node.reviewsRating),
    daily_rank: toInt(node.dailyRank) ?? rank,
    weekly_rank: toInt(node.weeklyRank),
    monthly_rank: toInt(node.monthlyRank),
    yearly_rank: toInt(node.yearlyRank),
    // Prefer the featured date; fall back to the creation date.
    launch_date: toDay(node.featuredAt) ?? toDay(node.createdAt),
    product_hunt_url: node.url ?? (node.slug ? `https://www.producthunt.com/posts/${node.slug}` : null),
    website_url: node.website ?? null,
    topics,
    thumbnail_url: node.thumbnail?.url ?? null,
    makers,
    featured: !!node.featuredAt,
    emails: [] as string[],
    email_source: 'none' as string,
    scraped_at: new Date().toISOString(),
  };
}
626
627
628
629
630
/** Record keys retained in `lean` output, in output order. */
const LEAN_FIELDS = new Set([
  'product_name', 'tagline', 'website_url', 'upvote_count', 'daily_rank',
  'emails', 'email_source', 'topics', 'launch_date', 'product_hunt_url',
  'featured', 'scraped_at',
]);

/**
 * Shapes a record for the requested output mode.
 *
 * - `full`:  the record itself, untouched.
 * - `lean`:  a new object holding only LEAN_FIELDS (missing values become `null`).
 * - `leads`: the record itself, or `null` when it has no emails and should be dropped.
 */
function applyOutputMode(
  record: Record<string, unknown>,
  mode: 'full' | 'lean' | 'leads',
): Record<string, unknown> | null {
  switch (mode) {
    case 'leads':
      return (record.emails as string[]).length === 0 ? null : record;
    case 'lean':
      return Object.fromEntries(
        Array.from(LEAN_FIELDS, (field): [string, unknown] => [field, record[field] ?? null]),
      );
    default:
      return record;
  }
}
649
650
/** Date window for this run, computed once; `null` when no date filter applies. */
const dateRange = buildDateRange();

/**
 * Builds the GraphQL variables for the next page request.
 *
 * @param after     Cursor of the previous page, or `null` for the first page.
 * @param remaining How many more products are still wanted; the page size is capped at 20.
 */
function buildVars(after: string | null, remaining: number) {
  const first = Math.min(20, remaining);

  // Search uses its own query, which takes only the search string and paging variables.
  if (mode === 'search') return { query: searchQuery, first, after };

  const vars: Record<string, unknown> = {
    first,
    after,
    order: 'VOTES',
    // Restrict to featured posts unless the caller asked for everything.
    ...(allProducts ? {} : { featured: true }),
    ...(dateRange ? { postedAfter: dateRange.postedAfter, postedBefore: dateRange.postedBefore } : {}),
    ...(mode === 'topic' ? { topic } : {}),
  };
  return vars;
}
662
663
/** The shared Playwright browser once it has been launched; `null` until first needed. */
let browser: Browser | null = null;

/** In-flight (or completed) launch, so concurrent callers share a single Chromium. */
let browserLaunchPromise: Promise<Browser> | null = null;

/** Launches headless Chromium with the flags used for this Actor. */
async function launchSharedBrowser(): Promise<Browser> {
  const launched = await chromium.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      '--no-zygote',
      '--disable-extensions',
      '--disable-background-networking',
      '--disable-default-apps',
      '--disable-blink-features=AutomationControlled',
    ],
  });
  browser = launched;
  log.info('Playwright browser launched.');
  return launched;
}

/**
 * Returns the shared browser, launching it on first use.
 *
 * Products are processed concurrently, so several callers can arrive before the first
 * launch has finished. Checking only `browser` would let each of them start its own
 * Chromium, and all but the last would never be closed. The launch promise is therefore
 * memoised; it is cleared again on failure so a later call can retry.
 */
async function ensureBrowser(): Promise<Browser> {
  if (browser) return browser;

  if (!browserLaunchPromise) {
    const launch = launchSharedBrowser();
    browserLaunchPromise = launch;
    // Side handler only resets the memo; callers still observe the rejection via `launch`.
    launch.catch(() => {
      if (browserLaunchPromise === launch) browserLaunchPromise = null;
    });
  }
  return browserLaunchPromise;
}
686
687
/**
 * Turns one API node into a dataset record, optionally enriches it with emails,
 * and pushes it to the dataset (charging one `product-scraped` event per pushed record).
 *
 * @param node Post node from the PH API.
 * @param rank Fallback daily rank, used when the API does not report one.
 */
async function processNode(node: any, rank: number): Promise<void> {
  const record = nodeToRecord(node, rank);

  const websiteUrl = typeof record.website_url === 'string' ? record.website_url : '';
  const phProductUrl = typeof record.product_hunt_url === 'string'
    ? record.product_hunt_url.split('?')[0]
    : undefined;
  const hasHttpUrl = websiteUrl.startsWith('http');

  if (hasHttpUrl && enrichEmails) {
    const br = await ensureBrowser();
    try {
      const result = await findEmails(websiteUrl, br, phProductUrl);
      record.website_url = result.resolvedUrl;
      record.emails = result.emails;
      record.email_source = result.emailSource;
      if (result.emails.length > 0) {
        log.info(` ✉ ${record.product_name}: ${result.emails.length} email(s) [${result.emailSource}] → ${result.emails.join(', ')}`);
      }
    } catch (err: any) {
      // Classify the failure for the log line only.
      const msg = err.message ?? '';
      let tag = '[error]';
      if (msg.includes('Timeout') || msg.includes('timeout')) {
        tag = '[timeout]';
      } else if (msg.includes('net::ERR') || msg.includes('ECONNREFUSED')) {
        tag = '[network]';
      } else if (msg.includes('Navigation')) {
        tag = '[navigation]';
      }
      log.warning(` Email enrichment failed ${tag} for ${record.product_name}: ${msg.slice(0, 100)}`);
      // Still try to report the real website, using the browser-less resolver.
      record.website_url = await resolveToRealUrl(websiteUrl);
    }
  } else if (hasHttpUrl) {
    record.website_url = await resolveToRealUrl(websiteUrl);
  }

  const outputRecord = applyOutputMode(record, outputMode);
  if (outputRecord === null) return; // dropped by `leads` mode — nothing pushed or charged

  await Actor.pushData(outputRecord);
  await Actor.charge({ eventName: 'product-scraped', count: 1 });
}
725
726
727
728
729
730
log.info('Fetching products from ProductHunt API (streaming)...', {
  dateRange: dateRange ?? 'no date filter', allProducts, maxResults,
});

const seenIds = new Set<string>();
let cursor: string | null = null;
let pageNum = 1;
let totalCollected = 0;
let totalProcessed = 0;

// Email enrichment drives a real browser, so it is limited to 3 products at a time.
// The batch size doubles as the loop step below, so it must be a positive integer:
// 0 would loop forever and NaN would silently skip every product after the first slice.
const requestedBatchSize = enrichEmails ? Math.min(maxConcurrency, 3) : maxConcurrency;
const batchSize = Number.isFinite(requestedBatchSize) && requestedBatchSize >= 1
  ? Math.floor(requestedBatchSize)
  : 1;

/** Closes the shared browser if one was launched; a close failure is logged, not thrown. */
async function closeBrowserIfOpen(): Promise<void> {
  if (!browser) return;
  try {
    await browser.close();
    log.info('Playwright browser closed.');
  } catch (err: any) {
    log.warning(`Failed to close Playwright browser: ${String(err?.message ?? err).slice(0, 100)}`);
  }
}

try {
  // Stream: fetch one API page, process it in small concurrent batches, then fetch the next.
  while (totalCollected < maxResults) {
    log.info(`Page ${pageNum} (collected ${totalCollected}/${maxResults})...`);
    let data: any;
    try {
      data = await phQuery(
        mode === 'search' ? SEARCH_QUERY : POSTS_QUERY,
        buildVars(cursor, maxResults - totalCollected),
      );
    } catch (err: any) {
      // An API failure ends pagination; everything pushed so far is kept.
      log.error(`PH API error on page ${pageNum}: ${err.message}`);
      break;
    }

    const edges = data?.posts?.edges ?? [];
    const pageInfo = data?.posts?.pageInfo ?? {};
    if (!edges.length) { log.info('No more results from PH API.'); break; }

    // De-duplicate by post id and stop at the requested limit.
    const pageNodes: any[] = [];
    for (const { node } of edges) {
      if (totalCollected >= maxResults) break;
      if (!node?.id || seenIds.has(node.id)) continue;
      seenIds.add(node.id);
      pageNodes.push(node);
      totalCollected++;
    }
    log.info(`Page ${pageNum}: +${pageNodes.length} products (total: ${totalCollected})`);

    for (let i = 0; i < pageNodes.length; i += batchSize) {
      const batch = pageNodes.slice(i, i + batchSize);
      // The fallback rank is the 1-based position across all pages.
      await Promise.all(batch.map((node, j) => processNode(node, totalProcessed + i + j + 1)));
    }
    totalProcessed += pageNodes.length;
    if (totalProcessed % 50 === 0 || !pageInfo.hasNextPage) {
      log.info(`✓ Pushed ${totalProcessed} products to dataset so far`);
    }

    if (!pageInfo.hasNextPage || !pageInfo.endCursor) { log.info('No more pages.'); break; }
    cursor = pageInfo.endCursor;
    pageNum++;
  }
} finally {
  // Release Chromium even when processing throws, so no browser process is left behind.
  await closeBrowserIfOpen();
}

log.info(`Done! ${totalProcessed} products scraped and pushed.`);
await Actor.exit();