import { Actor } from 'apify';

await Actor.init();

// Run configuration supplied via the Apify platform; falls back to {} (all defaults).
const input = await Actor.getInput() ?? {};

const {
  scrapeType = 'top',        // listing to scrape: 'top' | 'new' | 'best' | 'ask' | 'show' | 'job', or 'search' to skip listings
  searchQueries = [],        // Algolia full-text queries; each is processed in addition to the listing
  maxStories = 100,          // cap per listing and per search query
  includeComments = true,    // also fetch the comment tree for each story
  maxCommentsPerStory = 30,  // cap on comments collected per story
  minScore = 0,              // skip stories scoring below this threshold
} = input;

// Official Firebase-backed Hacker News API (listings + items).
const HN_API = 'https://hacker-news.firebaseio.com/v0';
// Algolia-powered Hacker News search API.
const HN_SEARCH = 'https://hn.algolia.com/api/v1';
18
/**
 * Fetch a URL and parse the JSON response body.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<any>} Parsed JSON body.
 * @throws {Error} On a non-2xx status; the message includes the status
 *   text and the URL so failed requests can be traced in run logs
 *   (the original message carried only the bare status code).
 */
async function fetchJson(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} ${response.statusText} for ${url}`);
  }
  return response.json();
}
24
// Retrieve a single HN item (story, comment, job, poll, ...) by numeric id.
async function getItem(id) {
  const itemUrl = `${HN_API}/item/${id}.json`;
  return fetchJson(itemUrl);
}
28
/**
 * Fetch the array of story ids for a given HN listing type.
 * Any unrecognized type falls back to the "top" listing.
 */
async function getStoryIds(type) {
  let endpoint;
  switch (type) {
    case 'new':
      endpoint = 'newstories';
      break;
    case 'best':
      endpoint = 'beststories';
      break;
    case 'ask':
      endpoint = 'askstories';
      break;
    case 'show':
      endpoint = 'showstories';
      break;
    case 'job':
      endpoint = 'jobstories';
      break;
    case 'top':
    default:
      endpoint = 'topstories';
      break;
  }
  return fetchJson(`${HN_API}/${endpoint}.json`);
}
40
/**
 * Search HN stories via the Algolia API (50 hits per page).
 *
 * @param {string} query - Free-text search query.
 * @param {number} [page=0] - Zero-based results page.
 * @returns {Promise<object>} Algolia response ({ hits, nbPages, ... }).
 */
async function searchStories(query, page = 0) {
  // Build the query string with URLSearchParams so every value is encoded
  // consistently, instead of hand-concatenating the URL.
  const params = new URLSearchParams({
    query,
    tags: 'story',
    hitsPerPage: '50',
    page: String(page),
  });
  return fetchJson(`${HN_SEARCH}/search?${params}`);
}
44
/**
 * Collect a flattened comment list for a story.
 *
 * Top-level comment ids are fetched in parallel batches of 5; each comment
 * recurses into at most 5 children up to 3 levels deep. Deleted/dead
 * comments are skipped, and individual fetch failures are swallowed
 * (deliberate best effort) so one bad comment never aborts the story.
 *
 * @param {object} storyItem - Full HN item; its optional `kids` array holds
 *   the top-level comment ids.
 * @param {number} maxComments - Hard cap on the number of comments returned.
 * @returns {Promise<object[]>} At most `maxComments` comment records, each
 *   with id, author, text, ISO time, depth, and childCount.
 */
async function getComments(storyItem, maxComments) {
  const MAX_DEPTH = 3; // how deep to recurse into reply threads
  const MAX_CHILDREN = 5; // replies fetched per comment
  const BATCH_SIZE = 5; // parallel top-level fetches
  const comments = [];
  const commentIds = storyItem.kids || [];

  async function fetchComment(id, depth = 0) {
    if (comments.length >= maxComments) return;
    try {
      const item = await getItem(id);
      if (!item || item.deleted || item.dead) return;

      // Re-check the quota AFTER the await: parallel siblings in the same
      // batch may have filled it while this request was in flight. The
      // original only checked before the await and could overshoot
      // maxComments by up to BATCH_SIZE - 1.
      if (comments.length >= maxComments) return;

      comments.push({
        id: item.id,
        author: item.by || '[deleted]',
        text: item.text || '',
        time: new Date(item.time * 1000).toISOString(),
        depth,
        childCount: (item.kids || []).length,
      });

      if (depth < MAX_DEPTH && item.kids) {
        for (const childId of item.kids.slice(0, MAX_CHILDREN)) {
          if (comments.length >= maxComments) break;
          await fetchComment(childId, depth + 1);
        }
      }
    } catch (e) {
      // Best effort: a single failed comment fetch is not fatal.
    }
  }

  for (let i = 0; i < commentIds.length; i += BATCH_SIZE) {
    if (comments.length >= maxComments) break;
    const batch = commentIds.slice(i, i + BATCH_SIZE);
    await Promise.all(batch.map((id) => fetchComment(id, 0)));
  }

  // Defensive cap: guarantee the contract even if a race slips through.
  return comments.slice(0, maxComments);
}
85
try {
  // Hostname for display. A malformed story URL must not abort the whole
  // run, so URL-parse failures yield null instead of throwing (the original
  // unguarded `new URL(...)` could crash the actor here and in search mode).
  const extractDomain = (url) => {
    if (!url) return null;
    try {
      return new URL(url).hostname;
    } catch {
      return null;
    }
  };

  // --- Listing mode: top / new / best / ask / show / job ---
  if (scrapeType !== 'search') {
    console.log(`Fetching ${scrapeType} stories...`);
    const storyIds = await getStoryIds(scrapeType);
    // Fetch extra ids so the minScore filter can still fill the quota.
    const idsToProcess = storyIds.slice(0, maxStories * 2);

    let collected = 0;

    // Process ids in parallel batches of 10 until the quota is met.
    for (let i = 0; i < idsToProcess.length && collected < maxStories; i += 10) {
      const batch = idsToProcess.slice(i, i + 10);
      const items = await Promise.all(batch.map((id) => getItem(id).catch(() => null)));

      for (const item of items) {
        if (collected >= maxStories) break;
        if (!item) continue;
        // Some items (e.g. jobs) may lack a score; treat missing as 0 so the
        // minScore filter applies consistently (`undefined < n` is always
        // false, which let score-less items bypass the filter).
        if ((item.score ?? 0) < minScore) continue;

        const story = {
          id: item.id,
          title: item.title,
          url: item.url || null,
          author: item.by,
          score: item.score,
          commentCount: item.descendants || 0,
          time: new Date(item.time * 1000).toISOString(),
          type: item.type,
          text: item.text || null,
          hnUrl: `https://news.ycombinator.com/item?id=${item.id}`,
          domain: extractDomain(item.url),
          source: scrapeType,
          scrapedAt: new Date().toISOString(),
        };

        if (includeComments && item.kids && item.kids.length > 0) {
          story.comments = await getComments(item, maxCommentsPerStory);
        }

        await Actor.pushData(story);
        collected++;

        if (collected % 10 === 0) {
          console.log(`Processed ${collected}/${maxStories} stories...`);
        }
      }
    }

    console.log(`\nCollected ${collected} ${scrapeType} stories.`);
  }

  // --- Search mode: one Algolia query per entry (runs in every mode) ---
  for (const query of searchQueries) {
    console.log(`\nSearching HN for: "${query}"`);

    let collected = 0;
    let page = 0;

    while (collected < maxStories) {
      const results = await searchStories(query, page);
      const hits = results.hits || [];

      if (hits.length === 0) break;

      for (const hit of hits) {
        if (collected >= maxStories) break;
        // Same rationale as above: treat a missing points value as 0.
        if ((hit.points ?? 0) < minScore) continue;

        const story = {
          id: Number.parseInt(hit.objectID, 10), // always pass the radix
          title: hit.title,
          url: hit.url || null,
          author: hit.author,
          score: hit.points,
          commentCount: hit.num_comments || 0,
          time: new Date(hit.created_at).toISOString(),
          hnUrl: `https://news.ycombinator.com/item?id=${hit.objectID}`,
          domain: extractDomain(hit.url),
          source: `search:${query}`,
          tags: hit._tags || [],
          scrapedAt: new Date().toISOString(),
        };

        // Algolia search results do not include comment bodies, so fall
        // back to the Firebase API for the full comment tree.
        if (includeComments && hit.num_comments > 0) {
          try {
            const fullItem = await getItem(Number.parseInt(hit.objectID, 10));
            if (fullItem && fullItem.kids) {
              story.comments = await getComments(fullItem, maxCommentsPerStory);
            }
          } catch (e) {
            story.comments = [];
          }
        }

        await Actor.pushData(story);
        collected++;
      }

      page++;
      if (page >= results.nbPages) break;
    }

    console.log(`Collected ${collected} stories for "${query}"`);
  }
} catch (e) {
  // Log the full error (including stack), not just the message, so failed
  // runs are debuggable from the actor log.
  console.error('Scraper failed:', e);
}

await Actor.exit();