1"""Hacker News Search & Monitor — Apify Actor.
2
3Uses the free, no-auth Algolia Hacker News Search API:
4 https://hn.algolia.com/api/v1/search (sorted by relevance: points & comments)
5 https://hn.algolia.com/api/v1/search_by_date (sorted by date: most recent first)
6
7API contract verified against a live response. Relevant fields per hit:
8 objectID, title, url, author, points, num_comments, created_at,
9 created_at_i, story_id, story_title, story_url, comment_text, _tags
10Top-level: hits, nbHits, nbPages, page, hitsPerPage
11"""
12
13from __future__ import annotations
14
15import asyncio
16from datetime import datetime, timezone
17from urllib.parse import urlencode
18
19import httpx
20from apify import Actor
21
# Algolia-hosted HN search endpoints: relevance-ranked vs. newest-first.
BASE_RELEVANCE = "https://hn.algolia.com/api/v1/search"
BASE_DATE = "https://hn.algolia.com/api/v1/search_by_date"

# Templates for links back to news.ycombinator.com, filled with an item id / username.
HN_ITEM_URL = "https://news.ycombinator.com/item?id={}"
HN_USER_URL = "https://news.ycombinator.com/user?id={}"

# Number of hits requested per API page.
PAGE_SIZE = 100


# Maps the actor's content-type option to the value sent in the `tags` parameter.
# Most options map to a tag of the same name; "all" maps to a parenthesised group.
CONTENT_TYPE_TAGS = {
    name: name for name in ("story", "comment", "ask_hn", "show_hn", "poll", "job")
}
CONTENT_TYPE_TAGS["all"] = "(story,comment)"
38
39
40def _date_to_timestamp(date_str: str) -> int | None:
41 """Parse a YYYY-MM-DD string into a unix timestamp (UTC). Returns None if unparseable."""
42 if not date_str:
43 return None
44 for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y"):
45 try:
46 dt = datetime.strptime(date_str.strip(), fmt).replace(tzinfo=timezone.utc)
47 return int(dt.timestamp())
48 except ValueError:
49 continue
50 return None
51
52
def build_tags(content_type: str, author: str | None) -> str:
    """Combine the content-type tag with an optional author filter (ANDed).

    Args:
        content_type: Key into CONTENT_TYPE_TAGS; unknown values fall back
            to the ``story`` tag.
        author: HN username to restrict results to. None, empty, or
            whitespace-only values mean "no author filter".

    Returns:
        The value for the API's ``tags`` parameter, e.g. ``story`` or
        ``story,author_pg``.
    """
    base = CONTENT_TYPE_TAGS.get(content_type, "story")

    # Strip before testing: a whitespace-only author would otherwise produce
    # a bare "author_" tag, which filters on an empty username.
    username = author.strip() if isinstance(author, str) else ""
    if not username:
        return base
    return f"{base},author_{username}"
61
62
def build_numeric_filters(
    min_points: int | None,
    min_comments: int | None,
    created_after_ts: int | None,
    created_before_ts: int | None,
) -> str:
    """Assemble the Algolia numericFilters value from whichever bounds are set.

    Each argument that is not None contributes one comparison clause; the
    clauses are joined with commas in a fixed order (points, comments,
    created-after, created-before). Returns an empty string when every
    argument is None.
    """
    # (field + operator, bound) pairs, in the order they appear in the output.
    clauses = (
        ("points>=", min_points),
        ("num_comments>=", min_comments),
        ("created_at_i>=", created_after_ts),
        ("created_at_i<", created_before_ts),
    )
    return ",".join(
        f"{lhs}{bound}" for lhs, bound in clauses if bound is not None
    )
80
81
def transform_hit(hit: dict, include_text: bool) -> dict:
    """Map one raw Algolia hit onto the actor's flat output record.

    Does no I/O, so it can be exercised directly with captured API fixtures.

    Args:
        hit: A single entry from the response's ``hits`` list.
        include_text: When true, add a ``text`` field taken from
            ``comment_text`` (falling back to ``story_text``).

    Returns:
        A dict with camelCase keys; fields absent from the hit are None.
    """
    hit_tags = hit.get("_tags") or []

    # The first label (in this order) found among the hit's tags decides the
    # type; a hit carrying none of them is reported as a plain story.
    type_priority = ("comment", "poll", "job", "show_hn", "ask_hn")
    item_type = next(
        (label for label in type_priority if label in hit_tags),
        "story",
    )

    item_id = hit.get("objectID")
    username = hit.get("author")

    # Comments carry no title/url of their own, so fall back to the story's.
    title = hit.get("title") or hit.get("story_title")
    link = hit.get("url") or hit.get("story_url")

    record = {
        "id": item_id,
        "type": item_type,
        "title": title,
        "url": link,
        "author": username,
        "points": hit.get("points"),
        "numComments": hit.get("num_comments"),
        "createdAt": hit.get("created_at"),
        "createdAtTimestamp": hit.get("created_at_i"),
        "storyId": hit.get("story_id"),
        "hnUrl": HN_ITEM_URL.format(item_id) if item_id else None,
        "authorUrl": HN_USER_URL.format(username) if username else None,
    }

    if include_text:
        record["text"] = hit.get("comment_text") or hit.get("story_text")

    return record
124
125
def _int_or_none(value: object) -> int | None:
    """Return *value* if it is a genuine int (bools excluded), otherwise None."""
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    return None


async def _fetch_page(client: httpx.AsyncClient, url: str, page: int) -> dict | None:
    """GET one result page, retrying on HTTP 429 and on request/decoding errors.

    Makes up to three attempts with a growing pause between them. Returns the
    decoded JSON payload, or None once every attempt has failed.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        is_last = attempt == max_attempts
        try:
            resp = await client.get(url)
            if resp.status_code == 429:
                if is_last:
                    # No point sleeping when there is no retry left.
                    Actor.log.warning("Rate limited (429) on final attempt.")
                else:
                    wait = attempt * 3
                    Actor.log.warning(f"Rate limited (429). Backing off {wait}s...")
                    await asyncio.sleep(wait)
                continue
            resp.raise_for_status()
            return resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            # ValueError covers a body that is not valid JSON.
            Actor.log.warning(f"Request attempt {attempt} failed: {exc}")
            if not is_last:
                await asyncio.sleep(attempt * 2)

    Actor.log.error(f"Giving up on page {page} after {max_attempts} attempts.")
    return None


async def main() -> None:
    """Actor entry point: read input, page through HN search results, push records.

    Reads the actor input, builds the ``tags`` / ``numericFilters`` query
    parameters, then requests result pages until ``maxItems`` records have
    been pushed, the API reports no further pages, or a page cannot be
    fetched. A failed page ends the run early with whatever was already
    pushed rather than raising.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        query = (actor_input.get("query") or "").strip()
        content_type = actor_input.get("contentType", "story")
        sort_by = actor_input.get("sortBy", "relevance")
        # An explicit null must behave like an absent key instead of crashing int().
        raw_max_items = actor_input.get("maxItems")
        max_items = 100 if raw_max_items is None else int(raw_max_items)
        min_points = _int_or_none(actor_input.get("minPoints"))
        min_comments = _int_or_none(actor_input.get("minComments"))
        author = actor_input.get("author")
        include_text = actor_input.get("includeCommentText", True)

        # Parse both date bounds; tell the user when a supplied value is
        # dropped instead of silently running without the filter.
        date_bounds: dict[str, int | None] = {}
        for key in ("createdAfter", "createdBefore"):
            raw_date = actor_input.get(key, "")
            date_bounds[key] = _date_to_timestamp(raw_date)
            if raw_date and date_bounds[key] is None:
                Actor.log.warning(f"Ignoring unparseable {key}={raw_date!r}.")

        base_url = BASE_DATE if sort_by == "date" else BASE_RELEVANCE
        tags = build_tags(content_type, author)
        numeric_filters = build_numeric_filters(
            min_points,
            min_comments,
            date_bounds["createdAfter"],
            date_bounds["createdBefore"],
        )

        Actor.log.info(
            f"Searching HN | query={query!r} type={content_type} sort={sort_by} "
            f"tags={tags} numericFilters={numeric_filters or '(none)'} maxItems={max_items}"
        )

        # The page size must stay constant for the whole run: the API derives
        # the offset from page * hitsPerPage, so shrinking it on later pages
        # would re-fetch items that were already returned.
        page_size = min(PAGE_SIZE, max_items)

        pushed = 0
        page = 0
        total_pages: int | None = None

        async with httpx.AsyncClient(
            timeout=30.0, headers={"User-Agent": "apify-hn-search/0.1"}
        ) as client:
            while pushed < max_items:
                params = {
                    "tags": tags,
                    "hitsPerPage": page_size,
                    "page": page,
                }
                if query:
                    params["query"] = query
                if numeric_filters:
                    params["numericFilters"] = numeric_filters

                url = f"{base_url}?{urlencode(params)}"

                data = await _fetch_page(client, url, page)
                if data is None:
                    break

                hits = data.get("hits", [])
                if total_pages is None:
                    total_pages = data.get("nbPages", 0)
                    Actor.log.info(
                        f"Matched {data.get('nbHits', 0)} items across {total_pages} pages."
                    )

                if not hits:
                    Actor.log.info("No more results.")
                    break

                # Trim the final page so no more than max_items are pushed.
                remaining = max_items - pushed
                batch = [transform_hit(hit, include_text) for hit in hits[:remaining]]
                pushed += len(batch)

                if batch:
                    await Actor.push_data(batch)
                    Actor.log.info(f"Pushed {pushed}/{max_items} items (page {page}).")

                page += 1
                if total_pages is not None and page >= total_pages:
                    Actor.log.info("Reached last available page.")
                    break

        Actor.log.info(f"Done. Returned {pushed} items.")