1import "dotenv/config";
2import { Actor, log } from "apify";
3import { withAnalytics } from "@apifyhub/analytics";
4
/**
 * Actor input as supplied by the user. Every field is optional; defaults and
 * validation are applied where the input is read.
 */
interface Input {
  /** Free-text search query; sent as the `query` URL parameter when non-empty. */
  query?: string;

  /** Item tag to filter by; must be one of VALID_TAGS. Defaults to "story". */
  tag?: string;

  /** Author name; turned into an `author_<name>` tag filter when non-empty. */
  author?: string;

  /** "relevance" (default) or "date"; selects which search endpoint is queried. */
  sortBy?: string;

  /** Inclusive lower date bound in YYYY-MM-DD format. */
  dateAfter?: string;

  /** Inclusive upper date bound in YYYY-MM-DD format. */
  dateBefore?: string;

  /** Minimum points; only applied as a filter when greater than zero. */
  minPoints?: number;

  /** Minimum comment count; only applied as a filter when greater than zero. */
  minComments?: number;

  /** Maximum number of items to push. Defaults to 100; 0 means unlimited. */
  maxItems?: number;

  /** Page number (as a string) to start from, e.g. a previously stored NEXT_PAGE_ID. */
  pageId?: string;
}
17
/**
 * One search hit as returned in the `hits` array of the HN Algolia API.
 * Only `objectID` is required; every other field may be missing, and many may be null.
 */
interface AlgoliaHit {
  /** Item identifier; also used to build the news.ycombinator.com item URL. */
  objectID: string;

  /** Creation time as a string (exact format not established here). */
  created_at?: string;
  /** Creation time as a number; date filters compare it against Unix seconds. */
  created_at_i?: number;

  author?: string;
  title?: string | null;
  url?: string | null;

  story_text?: string | null;
  comment_text?: string | null;

  story_id?: number | null;
  story_title?: string | null;
  story_url?: string | null;
  parent_id?: number | null;

  points?: number | null;
  num_comments?: number | null;

  /** Tag list attached to the hit; the item type is derived from it. */
  _tags?: string[];
}
35
/**
 * Shape of one search response page. Only the fields this file reads
 * (`hits`, `nbHits`, `nbPages`) plus basic paging metadata are declared.
 */
interface AlgoliaResponse {
  /** Hits contained in this page. */
  hits: AlgoliaHit[];

  /** Total hit count reported by the API; used only for progress logging. */
  nbHits?: number;

  /** Page index of this response. */
  page?: number;

  /** Number of pages available; used to decide when pagination is finished. */
  nbPages?: number;

  hitsPerPage?: number;
}
43
/**
 * Normalised record pushed to the dataset for each hit. All optional source
 * fields are flattened to explicit `null` so every record has the same keys.
 */
interface ItemRecord {
  objectID: string;

  /** First recognised type tag of the hit, or null when none is present. */
  type: string | null;

  title: string | null;
  url: string | null;
  author: string | null;
  points: number | null;
  numComments: number | null;

  storyText: string | null;
  commentText: string | null;

  storyId: number | null;
  storyTitle: string | null;
  storyUrl: string | null;
  parentId: number | null;

  createdAt: string | null;
  createdAtUnix: number | null;

  /** Link to the item on news.ycombinator.com, built from `objectID`. */
  hnUrl: string;

  /** Tag list copied from the hit (empty array when the hit has none). */
  tags: string[];
}
63
/** Key-value store key under which the resume cursor (next page number) is saved. */
const NEXT_PAGE_ID_KEY = "NEXT_PAGE_ID";

/** Accepted values for the `tag` input; "any" disables the tag filter. */
const VALID_TAGS = new Set<string>([
  "any",
  "story",
  "comment",
  "show_hn",
  "ask_hn",
  "poll",
  "job",
  "front_page",
]);

/** Accepted values for the `sortBy` input. */
const VALID_SORT = new Set<string>(["relevance", "date"]);

/** Tags that are treated as an item "type" when deriving a record's type. */
const TYPE_TAGS: string[] = [
  "story",
  "comment",
  "show_hn",
  "ask_hn",
  "poll",
  "pollopt",
  "job",
];

/** Number of hits requested per page. */
const PAGE_SIZE = 100;

/** Safety cap on the number of pages fetched in a single run. */
const MAX_PAGES = 1000;
70
/**
 * Determines an item's type from its tag list.
 *
 * @param tags Tags attached to a hit, in the order they were received.
 * @returns The first tag (in `tags` order) that is listed in TYPE_TAGS, or
 *   null when no tag matches.
 */
function deriveType(tags: string[]): string | null {
  const firstTypeTag = tags.find((candidate) => TYPE_TAGS.includes(candidate));
  return firstTypeTag ?? null;
}
77
/**
 * Parses a strict `YYYY-MM-DD` calendar date into Unix seconds (UTC).
 *
 * `Date.UTC` never yields NaN for integer components; it silently rolls
 * out-of-range values over (e.g. Feb 30 becomes Mar 1, month 13 becomes
 * January of the next year). The parsed components are therefore round-tripped
 * through a `Date` and rejected when they do not survive unchanged.
 *
 * @param raw Date string; surrounding whitespace is ignored.
 * @param endOfDay When true, returns 23:59:59 UTC of that day; otherwise 00:00:00 UTC.
 * @returns Unix timestamp in seconds, or null when `raw` is empty, is not in
 *   `YYYY-MM-DD` form, or does not name a real calendar date. Years 0000-0099
 *   are also rejected because `Date.UTC` maps them to 1900-1999.
 */
function parseDate(raw: string, endOfDay: boolean): number | null {
  const match = /^(\d{4})-(\d{2})-(\d{2})$/.exec(raw.trim());
  if (!match) return null;

  const year = Number(match[1]);
  const monthIndex = Number(match[2]) - 1; // Date.UTC expects a 0-based month
  const day = Number(match[3]);

  const startOfDayMs = Date.UTC(year, monthIndex, day, 0, 0, 0);
  const probe = new Date(startOfDayMs);
  const isRealDate =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === monthIndex &&
    probe.getUTCDate() === day;
  if (!isRealDate) return null;

  const ms = endOfDay ? Date.UTC(year, monthIndex, day, 23, 59, 59) : startOfDayMs;
  return Math.floor(ms / 1000);
}
92
/**
 * Builds the comma-separated value for the `tags` search parameter.
 *
 * @param tag Selected tag; the literal "any" contributes no clause.
 * @param author Author name; a non-empty value adds an `author_<name>` clause.
 * @returns The clauses joined by commas, or null when there are no clauses.
 */
function buildTagsFilter(tag: string, author: string): string | null {
  const tagClauses = tag === "any" ? [] : [tag];
  const authorClauses = author ? [`author_${author}`] : [];
  const clauses = [...tagClauses, ...authorClauses];

  if (clauses.length === 0) return null;
  return clauses.join(",");
}
99
/**
 * Builds the comma-separated value for the `numericFilters` search parameter.
 *
 * @param afterUnix Inclusive lower bound on `created_at_i`, or null for none.
 * @param beforeUnix Inclusive upper bound on `created_at_i`, or null for none.
 * @param minPoints Minimum points; ignored unless greater than zero.
 * @param minComments Minimum comment count; ignored unless greater than zero.
 * @returns The active clauses joined by commas, or null when none are active.
 */
function buildNumericFilters(
  afterUnix: number | null,
  beforeUnix: number | null,
  minPoints: number,
  minComments: number,
): string | null {
  // Each entry pairs "is this filter active?" with the clause it contributes.
  const candidates: Array<[boolean, string]> = [
    [afterUnix !== null, `created_at_i>=${afterUnix}`],
    [beforeUnix !== null, `created_at_i<=${beforeUnix}`],
    [minPoints > 0, `points>=${minPoints}`],
    [minComments > 0, `num_comments>=${minComments}`],
  ];

  const active = candidates
    .filter(([enabled]) => enabled)
    .map(([, clause]) => clause);

  return active.length === 0 ? null : active.join(",");
}
113
/**
 * Converts a raw search hit into the record shape stored in the dataset.
 *
 * Missing or undefined source fields become explicit nulls, the item type is
 * derived from the hit's tags, and a news.ycombinator.com link is built from
 * the hit's `objectID`.
 *
 * @param h Raw hit from the search response.
 * @returns The normalised record.
 */
function mapHit(h: AlgoliaHit): ItemRecord {
  // Collapses both `undefined` and `null` to `null`.
  const orNull = <T>(value: T | null | undefined): T | null => value ?? null;
  const tags = h._tags ?? [];

  const record: ItemRecord = {
    objectID: h.objectID,
    type: deriveType(tags),
    title: orNull(h.title),
    url: orNull(h.url),
    author: orNull(h.author),
    points: orNull(h.points),
    numComments: orNull(h.num_comments),
    storyText: orNull(h.story_text),
    commentText: orNull(h.comment_text),
    storyId: orNull(h.story_id),
    storyTitle: orNull(h.story_title),
    storyUrl: orNull(h.story_url),
    parentId: orNull(h.parent_id),
    createdAt: orNull(h.created_at),
    createdAtUnix: orNull(h.created_at_i),
    hnUrl: `https://news.ycombinator.com/item?id=${h.objectID}`,
    tags,
  };
  return record;
}
136
/**
 * Fetches a single page of search results from the HN Algolia API.
 *
 * @param endpoint Which API endpoint to call ("search" or "search_by_date").
 * @param query Free-text query; omitted from the URL when empty.
 * @param tagsFilter Value for the `tags` parameter; omitted when null or empty.
 * @param numericFilters Value for the `numericFilters` parameter; omitted when null or empty.
 * @param page Page index to request.
 * @returns The parsed JSON body, assumed (not validated) to match AlgoliaResponse.
 * @throws Error when the response status is not OK; the message carries the
 *   status code and, when readable, the response body. Network failures and
 *   the 30-second abort timeout surface as whatever `fetch` rejects with.
 */
async function fetchPage(
  endpoint: "search" | "search_by_date",
  query: string,
  tagsFilter: string | null,
  numericFilters: string | null,
  page: number,
): Promise<AlgoliaResponse> {
  const url = new URL(`https://hn.algolia.com/api/v1/${endpoint}`);

  // Optional parameters first (skipped when empty), then the fixed paging ones.
  const optionalParams: Array<[string, string | null]> = [
    ["query", query],
    ["tags", tagsFilter],
    ["numericFilters", numericFilters],
  ];
  for (const [name, value] of optionalParams) {
    if (value) url.searchParams.set(name, value);
  }
  url.searchParams.set("hitsPerPage", String(PAGE_SIZE));
  url.searchParams.set("page", String(page));

  const response = await fetch(url, {
    headers: { accept: "application/json" },
    signal: AbortSignal.timeout(30_000),
  });

  if (response.ok) {
    return (await response.json()) as AlgoliaResponse;
  }

  // Reading the body is best-effort; fall back to an empty string on failure.
  const body = await response.text().catch(() => "");
  throw new Error(`Algolia ${response.status}: ${body}`);
}
162
// ---------------------------------------------------------------------------
// Actor start-up: read the input, apply defaults, and validate it. Any invalid
// value fails the run with a descriptive message before a request is made.
// ---------------------------------------------------------------------------
await Actor.init();

const input = (await Actor.getInput<Input>()) ?? {};

const query = (input.query ?? "").trim();

// Tag defaults to "story" and is matched case-insensitively.
const tag = (input.tag ?? "story").toLowerCase();
if (!VALID_TAGS.has(tag)) {
  await Actor.fail(`Input 'tag' must be one of: ${[...VALID_TAGS].join(", ")}.`);
  // Safety net in case Actor.fail returns instead of terminating the process.
  process.exit(1);
}

// Sort order defaults to "relevance" and is matched case-insensitively.
const sortBy = (input.sortBy ?? "relevance").toLowerCase();
if (!VALID_SORT.has(sortBy)) {
  await Actor.fail(`Input 'sortBy' must be 'relevance' or 'date'.`);
  process.exit(1);
}

const author = (input.author ?? "").trim();

// Date bounds: an empty value means "no bound"; a non-empty value must parse.
const dateAfterRaw = (input.dateAfter ?? "").trim();
const dateBeforeRaw = (input.dateBefore ?? "").trim();
const afterUnix = dateAfterRaw ? parseDate(dateAfterRaw, false) : null;
const beforeUnix = dateBeforeRaw ? parseDate(dateBeforeRaw, true) : null;

if (dateAfterRaw && afterUnix === null) {
  await Actor.fail(`Input 'dateAfter' must be in YYYY-MM-DD format (got '${dateAfterRaw}').`);
  process.exit(1);
}
if (dateBeforeRaw && beforeUnix === null) {
  await Actor.fail(`Input 'dateBefore' must be in YYYY-MM-DD format (got '${dateBeforeRaw}').`);
  process.exit(1);
}

// Negative thresholds are clamped to zero; maxItems of 0 means "unlimited".
const minPoints = Math.max(0, input.minPoints ?? 0);
const minComments = Math.max(0, input.minComments ?? 0);
const maxItems = Math.max(0, input.maxItems ?? 100);

// pageId must consist of digits only. A bare parseInt would accept trailing
// garbage ("3abc" -> 3, "1.9" -> 1) and silently start from the wrong page.
const pageIdRaw = input.pageId ? input.pageId.trim() : "";
const startPage = input.pageId
  ? (/^\d+$/.test(pageIdRaw) ? parseInt(pageIdRaw, 10) : Number.NaN)
  : 0;
if (!Number.isSafeInteger(startPage) || startPage < 0) {
  await Actor.fail(`Input 'pageId' must be a non-negative integer (got '${input.pageId}').`);
  process.exit(1);
}
203
// ---------------------------------------------------------------------------
// Main run: page through the search results, push mapped records to the
// dataset, and store a resume cursor under NEXT_PAGE_ID. The work is wrapped
// in withAnalytics (from @apifyhub/analytics; its behaviour is not visible here).
// ---------------------------------------------------------------------------
await withAnalytics({ apifyHubKey: process.env.APIFYHUB_KEY ?? "" }, async () => {
  const endpoint = sortBy === "date" ? "search_by_date" : "search";
  const tagsFilter = buildTagsFilter(tag, author);
  const numericFilters = buildNumericFilters(afterUnix, beforeUnix, minPoints, minComments);

  const dataset = await Actor.openDataset<ItemRecord>();

  // Stop one minute before the platform timeout so the cursor can still be saved.
  const timeoutAt = Actor.getEnv().timeoutAt;
  const deadlineMs = timeoutAt ? timeoutAt.getTime() - 60_000 : null;

  let pushedTotal = 0;
  let page = startPage;
  // Page to resume from on the next run; null once the results are exhausted.
  let lastGoodPage: number | null = startPage;
  let pages = 0;

  log.info(
    `[hn] query='${query}' tag=${tag}${author ? ` author=${author}` : ""} sort=${sortBy} startPage=${startPage} max=${maxItems || "unlimited"}`,
  );

  try {
    while (true) {
      if (maxItems > 0 && pushedTotal >= maxItems) break;
      if (pages >= MAX_PAGES) {
        log.warning(`[hn] hit MAX_PAGES safety cap.`);
        break;
      }
      if (deadlineMs && Date.now() > deadlineMs) {
        log.warning(`[hn] approaching actor timeout. Stopping early; resume with NEXT_PAGE_ID.`);
        break;
      }

      const data = await fetchPage(endpoint, query, tagsFilter, numericFilters, page);
      const hits = data.hits ?? [];
      pages++;

      // Push the page as one batch instead of one storage call per item.
      // Math.ceil keeps the item count identical to a per-item loop even when
      // maxItems is fractional, and guarantees progress towards the limit.
      const batch = maxItems > 0 ? hits.slice(0, Math.ceil(maxItems - pushedTotal)) : hits;
      if (batch.length > 0) {
        await dataset.pushData(batch.map((hit) => mapHit(hit)));
      }
      const pushedThisPage = batch.length;
      pushedTotal += pushedThisPage;

      log.info(
        `[hn] page ${page} (${pages} fetched): +${pushedThisPage} (total ${pushedTotal}/${data.nbHits ?? "?"})`,
      );

      // An empty page, or having just read the last page, ends pagination.
      const nbPages = data.nbPages ?? 0;
      if (hits.length === 0 || page + 1 >= nbPages) {
        lastGoodPage = null;
        break;
      }

      // NOTE(review): the cursor has page granularity. When maxItems cuts a
      // page short, the remaining hits of that page are skipped on resume.
      page++;
      lastGoodPage = page;
    }
  } catch (err: unknown) {
    // Best-effort: log and fall through so the resume cursor is still stored.
    // lastGoodPage still points at the page that failed, so it is retried.
    const message = err instanceof Error ? err.message : String(err);
    log.error(`[hn] failed: ${message}. Pushed ${pushedTotal} so far.`);
  }

  await Actor.setValue(NEXT_PAGE_ID_KEY, lastGoodPage === null ? null : String(lastGoodPage));

  log.info(`[hn] done. items=${pushedTotal} ${NEXT_PAGE_ID_KEY}=${lastGoodPage ?? "null"}`);
});

await Actor.exit();