1import "dotenv/config";
2import { Actor, log } from "apify";
3
/**
 * Actor input as returned by `Actor.getInput()`.
 *
 * Every field is optional; the top-level script trims, lower-cases and
 * applies defaults before any value is used.
 */
interface Input {
  /** Free-text search query. */
  query?: string;
  /** Item tag to filter on; the script checks it against `VALID_TAGS`. */
  tag?: string;
  /** Restrict results to a single author (username). */
  author?: string;
  /** Sort order; the script checks it against `VALID_SORT`. */
  sortBy?: string;
  /** Lower date bound, `YYYY-MM-DD`. */
  dateAfter?: string;
  /** Upper date bound, `YYYY-MM-DD`. */
  dateBefore?: string;
  /** Minimum points an item must have. */
  minPoints?: number;
  /** Minimum number of comments an item must have. */
  minComments?: number;
  /** Cap on the number of items pushed; `0` is logged as "unlimited". */
  maxItems?: number;
  /** Page index to start from, given as a string. */
  pageId?: string;
}
16
/**
 * One entry of the `hits` array in a search response, limited to the
 * fields this file reads. Only `objectID` is required; everything else may
 * be missing and most fields may also be `null`.
 */
interface AlgoliaHit {
  /** Item identifier; also used to build the item's Hacker News URL. */
  objectID: string;

  // Creation time, as a string and as a number.
  created_at?: string;
  created_at_i?: number;

  // Item content.
  author?: string;
  title?: string | null;
  url?: string | null;
  story_text?: string | null;
  comment_text?: string | null;

  // Links to the story / parent item.
  story_id?: number | null;
  story_title?: string | null;
  story_url?: string | null;
  parent_id?: number | null;

  // Counters.
  points?: number | null;
  num_comments?: number | null;

  /** Raw tag list; the item type is derived from it. */
  _tags?: string[];
}
34
/**
 * Envelope of one search response page. Only `hits` is declared as
 * required; the paging counters are optional.
 */
interface AlgoliaResponse {
  /** Items on this page. */
  hits: AlgoliaHit[];
  /** Total hit count; only used for progress logging. */
  nbHits?: number;
  /** Index of this page. */
  page?: number;
  /** Number of pages available; the crawl loop stops when it is reached. */
  nbPages?: number;
  /** Page size reported by the API. */
  hitsPerPage?: number;
}
42
/**
 * Dataset row produced for every hit. All keys are always present;
 * values missing from the source hit are stored as `null`.
 */
interface ItemRecord {
  /** Item identifier copied from the hit. */
  objectID: string;
  /** Item type derived from the tag list, or `null` if none matched. */
  type: string | null;

  // Content fields.
  title: string | null;
  url: string | null;
  author: string | null;
  points: number | null;
  numComments: number | null;
  storyText: string | null;
  commentText: string | null;

  // Relationship fields.
  storyId: number | null;
  storyTitle: string | null;
  storyUrl: string | null;
  parentId: number | null;

  // Creation time, string and numeric form.
  createdAt: string | null;
  createdAtUnix: number | null;

  /** Link to the item on news.ycombinator.com. */
  hnUrl: string;
  /** Tag list from the hit (empty array when the hit had none). */
  tags: string[];
}
62
/** Key under which the page index to resume from (or `null`) is stored. */
const NEXT_PAGE_ID_KEY = "NEXT_PAGE_ID";

/** Values accepted for the `tag` input. */
const VALID_TAGS = new Set<string>([
  "any",
  "story",
  "comment",
  "show_hn",
  "ask_hn",
  "poll",
  "job",
  "front_page",
]);

/** Values accepted for the `sortBy` input. */
const VALID_SORT = new Set<string>(["relevance", "date"]);

/** Tags that count as an item type when deriving a record's `type`. */
const TYPE_TAGS = [
  "story",
  "comment",
  "show_hn",
  "ask_hn",
  "poll",
  "pollopt",
  "job",
];

/** Number of hits requested per page. */
const PAGE_SIZE = 100;

/** Safety cap on the number of pages fetched in one run. */
const MAX_PAGES = 1000;
69
/**
 * Picks the item type out of a hit's tag list.
 *
 * @param tags Tag list of a hit.
 * @returns The first entry of `tags` (in `tags` order) that is listed in
 *   `TYPE_TAGS`, or `null` when none is.
 */
function deriveType(tags: string[]): string | null {
  const firstTypeTag = tags.find((candidate) => TYPE_TAGS.includes(candidate));
  return firstTypeTag ?? null;
}
76
/**
 * Parses a strict `YYYY-MM-DD` calendar date into a Unix timestamp (seconds, UTC).
 *
 * @param raw Date string; surrounding whitespace is ignored.
 * @param endOfDay When `true` the result is 23:59:59 UTC of that day,
 *   otherwise 00:00:00 UTC.
 * @returns Seconds since the epoch, or `null` when `raw` is empty, not in
 *   `YYYY-MM-DD` form, or not a real calendar date (e.g. `2024-02-30`,
 *   month `13`). Years `0000`–`0099` are rejected as well, because
 *   `Date.UTC` remaps them to 1900–1999.
 */
function parseDate(raw: string, endOfDay: boolean): number | null {
  const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(raw.trim());
  if (!m) return null;

  const y = Number(m[1]);
  const mo = Number(m[2]) - 1;
  const d = Number(m[3]);

  // Date.UTC never yields NaN for out-of-range components; it silently
  // rolls over (Feb 30 -> Mar 1). Round-trip the components to detect that.
  const midnight = new Date(Date.UTC(y, mo, d, 0, 0, 0));
  const roundTrips =
    midnight.getUTCFullYear() === y &&
    midnight.getUTCMonth() === mo &&
    midnight.getUTCDate() === d;
  if (!roundTrips) return null;

  const ts = endOfDay ? Date.UTC(y, mo, d, 23, 59, 59) : midnight.getTime();
  return Math.floor(ts / 1000);
}
91
/**
 * Builds the value of the `tags` query parameter.
 *
 * @param tag Item tag; the special value `"any"` adds no tag filter.
 * @param author Author name; an empty string adds no author filter.
 * @returns The selected filters joined with `,` (tag first, then
 *   `author_<name>`), or `null` when neither filter applies.
 */
function buildTagsFilter(tag: string, author: string): string | null {
  const tagPart = tag === "any" ? [] : [tag];
  const authorPart = author ? [`author_${author}`] : [];
  const filters = [...tagPart, ...authorPart];

  if (filters.length === 0) return null;
  return filters.join(",");
}
98
/**
 * Builds the value of the `numericFilters` query parameter.
 *
 * @param afterUnix Lower bound for `created_at_i`, or `null` for none.
 * @param beforeUnix Upper bound for `created_at_i`, or `null` for none.
 * @param minPoints Minimum `points`; only applied when greater than zero.
 * @param minComments Minimum `num_comments`; only applied when greater than zero.
 * @returns The active conditions joined with `,`, or `null` when there are none.
 */
function buildNumericFilters(
  afterUnix: number | null,
  beforeUnix: number | null,
  minPoints: number,
  minComments: number,
): string | null {
  const clauses = [
    afterUnix === null ? null : `created_at_i>=${afterUnix}`,
    beforeUnix === null ? null : `created_at_i<=${beforeUnix}`,
    minPoints > 0 ? `points>=${minPoints}` : null,
    minComments > 0 ? `num_comments>=${minComments}` : null,
  ].filter((clause): clause is string => clause !== null);

  return clauses.length === 0 ? null : clauses.join(",");
}
112
/**
 * Converts a raw search hit into the dataset row shape.
 *
 * Missing (`undefined`) and `null` source fields both become `null`; a
 * missing tag list becomes an empty array.
 *
 * @param h Hit as received from the search API.
 * @returns The row to store for this hit.
 */
function mapHit(h: AlgoliaHit): ItemRecord {
  // Collapses undefined/null to null while leaving real values untouched.
  const orNull = <T>(value: T | null | undefined): T | null => value ?? null;

  const tags = h._tags ?? [];
  const { objectID } = h;

  return {
    objectID,
    type: deriveType(tags),
    title: orNull(h.title),
    url: orNull(h.url),
    author: orNull(h.author),
    points: orNull(h.points),
    numComments: orNull(h.num_comments),
    storyText: orNull(h.story_text),
    commentText: orNull(h.comment_text),
    storyId: orNull(h.story_id),
    storyTitle: orNull(h.story_title),
    storyUrl: orNull(h.story_url),
    parentId: orNull(h.parent_id),
    createdAt: orNull(h.created_at),
    createdAtUnix: orNull(h.created_at_i),
    hnUrl: `https://news.ycombinator.com/item?id=${objectID}`,
    tags,
  };
}
135
/**
 * Fetches one page of results from the HN Algolia search API.
 *
 * @param endpoint API endpoint name appended to the base URL.
 * @param query Search text; omitted from the request when empty.
 * @param tagsFilter Value for the `tags` parameter, or `null` to omit it.
 * @param numericFilters Value for the `numericFilters` parameter, or `null` to omit it.
 * @param page Page index to request.
 * @returns The parsed JSON body, typed (not validated) as `AlgoliaResponse`.
 * @throws Error with the HTTP status and response body when the response
 *   is not OK; fetch/timeout errors (30 s limit) propagate unchanged.
 */
async function fetchPage(
  endpoint: "search" | "search_by_date",
  query: string,
  tagsFilter: string | null,
  numericFilters: string | null,
  page: number,
): Promise<AlgoliaResponse> {
  const target = new URL(`https://hn.algolia.com/api/v1/${endpoint}`);

  // Optional parameters are only sent when they carry a non-empty value.
  const optionalParams: Array<[string, string | null]> = [
    ["query", query],
    ["tags", tagsFilter],
    ["numericFilters", numericFilters],
  ];
  for (const [key, value] of optionalParams) {
    if (value) target.searchParams.set(key, value);
  }
  target.searchParams.set("hitsPerPage", String(PAGE_SIZE));
  target.searchParams.set("page", String(page));

  const response = await fetch(target, {
    headers: { accept: "application/json" },
    signal: AbortSignal.timeout(30_000),
  });

  if (response.ok) {
    return (await response.json()) as AlgoliaResponse;
  }

  // Reading the error body is best-effort; fall back to an empty string.
  const body = await response.text().catch(() => "");
  throw new Error(`Algolia ${response.status}: ${body}`);
}
161
// ---------------------------------------------------------------------------
// Actor start-up: read the input, normalise it, and fail fast on bad values.
// Each failed check reports through Actor.fail(); the process.exit(1) that
// follows is a guard in case Actor.fail() returns instead of terminating.
// ---------------------------------------------------------------------------
await Actor.init();

const input = (await Actor.getInput<Input>()) ?? {};

const query = (input.query ?? "").trim();

// Tag defaults to "story" and is matched case-insensitively.
const tag = (input.tag ?? "story").toLowerCase();
if (!VALID_TAGS.has(tag)) {
  await Actor.fail(`Input 'tag' must be one of: ${[...VALID_TAGS].join(", ")}.`);
  process.exit(1);
}

// Sort order defaults to "relevance" and is matched case-insensitively.
const sortBy = (input.sortBy ?? "relevance").toLowerCase();
if (!VALID_SORT.has(sortBy)) {
  await Actor.fail(`Input 'sortBy' must be 'relevance' or 'date'.`);
  process.exit(1);
}

const author = (input.author ?? "").trim();

// Date bounds: an empty string means "no bound"; a non-empty string that
// does not parse is an input error. The lower bound is taken at the start
// of its day and the upper bound at the end of its day.
const dateAfterRaw = (input.dateAfter ?? "").trim();
const dateBeforeRaw = (input.dateBefore ?? "").trim();
const afterUnix = dateAfterRaw ? parseDate(dateAfterRaw, false) : null;
const beforeUnix = dateBeforeRaw ? parseDate(dateBeforeRaw, true) : null;

if (dateAfterRaw && afterUnix === null) {
  await Actor.fail(`Input 'dateAfter' must be in YYYY-MM-DD format (got '${dateAfterRaw}').`);
  process.exit(1);
}
if (dateBeforeRaw && beforeUnix === null) {
  await Actor.fail(`Input 'dateBefore' must be in YYYY-MM-DD format (got '${dateBeforeRaw}').`);
  process.exit(1);
}

// Negative numbers are clamped to zero; for maxItems zero means "unlimited".
const minPoints = Math.max(0, input.minPoints ?? 0);
const minComments = Math.max(0, input.minComments ?? 0);
const maxItems = Math.max(0, input.maxItems ?? 100);

// pageId must consist of digits only. parseInt() would also accept inputs
// such as "3abc" or "2.9", so the value is validated with a strict pattern.
// It is stringified first so that a numeric pageId does not throw on trim().
const pageIdRaw = String(input.pageId ?? "").trim();
const startPage =
  pageIdRaw === "" ? 0 : /^\d+$/.test(pageIdRaw) ? Number(pageIdRaw) : Number.NaN;
if (!Number.isSafeInteger(startPage)) {
  await Actor.fail(`Input 'pageId' must be a non-negative integer (got '${input.pageId}').`);
  process.exit(1);
}

// Request parameters derived from the validated input.
const endpoint = sortBy === "date" ? "search_by_date" : "search";
const tagsFilter = buildTagsFilter(tag, author);
const numericFilters = buildNumericFilters(afterUnix, beforeUnix, minPoints, minComments);
206
// ---------------------------------------------------------------------------
// Crawl loop: fetch pages one after another, push the mapped hits to the
// dataset, and finally store the page index a later run should resume from.
// ---------------------------------------------------------------------------
const dataset = await Actor.openDataset<ItemRecord>();
const timeoutAt = Actor.getEnv().timeoutAt;
// Stop 60 s before the actor timeout (when one is known) so the resume
// marker can still be written.
const deadlineMs = timeoutAt ? timeoutAt.getTime() - 60_000 : null;

let pushedTotal = 0;
let page = startPage;
// Page to resume from; null once the result set has been exhausted.
let lastGoodPage: number | null = startPage;
let pages = 0;

log.info(
  `[hn] query='${query}' tag=${tag}${author ? ` author=${author}` : ""} sort=${sortBy} startPage=${startPage} max=${maxItems || "unlimited"}`,
);

try {
  while (true) {
    if (maxItems > 0 && pushedTotal >= maxItems) break;
    if (pages >= MAX_PAGES) {
      log.warning(`[hn] hit MAX_PAGES safety cap.`);
      break;
    }
    if (deadlineMs !== null && Date.now() > deadlineMs) {
      log.warning(`[hn] approaching actor timeout. Stopping early; resume with NEXT_PAGE_ID.`);
      break;
    }

    const data = await fetchPage(endpoint, query, tagsFilter, numericFilters, page);
    const hits = data.hits ?? [];
    pages++;

    // Take only as many hits as the maxItems budget still allows and push
    // them with a single dataset call instead of one call per item.
    const remaining = maxItems > 0 ? maxItems - pushedTotal : hits.length;
    const batch = hits.slice(0, remaining).map((h) => mapHit(h));
    if (batch.length > 0) {
      await dataset.pushData(batch);
    }
    const pushedThisPage = batch.length;
    pushedTotal += pushedThisPage;

    log.info(
      `[hn] page ${page} (${pages} fetched): +${pushedThisPage} (total ${pushedTotal}/${data.nbHits ?? "?"})`,
    );

    // An empty page or the last reported page ends the crawl; there is
    // nothing left to resume from.
    const nbPages = data.nbPages ?? 0;
    if (hits.length === 0 || page + 1 >= nbPages) {
      lastGoodPage = null;
      break;
    }

    // NOTE(review): when maxItems cuts a page short, the resume marker still
    // advances to the following page, so the unread rest of this page is
    // skipped on resume. Confirm this is intended.
    page++;
    lastGoodPage = page;
  }
} catch (err) {
  // Best-effort: keep what was pushed and still write the resume marker
  // below. On failure lastGoodPage points at the page that has to be retried.
  const reason = err instanceof Error ? err.message : String(err);
  log.error(`[hn] failed: ${reason}. Pushed ${pushedTotal} so far.`);
}

await Actor.setValue(NEXT_PAGE_ID_KEY, lastGoodPage === null ? null : String(lastGoodPage));

log.info(`[hn] done. items=${pushedTotal} ${NEXT_PAGE_ID_KEY}=${lastGoodPage ?? "null"}`);

await Actor.exit();