1import { Actor } from 'apify';
2import axios from 'axios';
3
// Origin of the Arctic Shift web app; also the base of the Referer header.
const SITE_BASE = 'https://arctic-shift.photon-reddit.com';

// Comment-search endpoint requested for every result page.
const API_URL = `${SITE_BASE}/api/comments/search`;

// Browser-like request headers attached to every API call.
const DEFAULT_HEADERS = {
  'accept': '*/*',
  'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
  'cache-control': 'no-cache',
  'pragma': 'no-cache',
  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
};

// Input fields that narrow a search; at least one of them must be supplied.
const SCOPE_FIELDS = [
  'subreddit',
  'author',
  'link_id',
  'parent_id',
];
17
/**
 * Pages through the comment-search endpoint at API_URL and pushes every
 * unique comment it receives through `Actor.pushData`.
 *
 * Typical use: `await new RedditCommentsSearchScraper().run(input)`.
 */
class RedditCommentsSearchScraper {
  /**
   * Coerces an optional input value to a trimmed string.
   *
   * @param {*} value - Raw input value.
   * @returns {string} Trimmed text, or '' when the value is null/undefined.
   */
  normalizeOptionalString(value) {
    if (value === undefined || value === null) return '';
    return String(value).trim();
  }

  /**
   * Resolves the per-page `limit` sent to the API.
   *
   * Missing, non-numeric and zero values fall back to 100; everything else is
   * truncated to an integer and clamped to the 1..100 range. Both the request
   * parameter and the last-page detection use this single value so they can
   * never disagree.
   *
   * @param {*} value - Raw `limit` input.
   * @returns {number} Integer between 1 and 100.
   */
  normalizeLimit(value) {
    const numeric = Number(value);
    if (!Number.isFinite(numeric) || numeric === 0) return 100;
    return Math.min(Math.max(Math.trunc(numeric), 1), 100);
  }

  /**
   * Resolves an optional upper bound such as `maxItems` or `maxPages`.
   *
   * @param {*} value - Raw cap from the input.
   * @returns {number} The numeric cap, or Infinity when the value is missing
   *   (undefined, null, '') or not a number.
   */
  normalizeCap(value) {
    if (value === undefined || value === null || value === '') return Infinity;
    const numeric = Number(value);
    return Number.isNaN(numeric) ? Infinity : numeric;
  }

  /**
   * Converts a user-supplied timestamp to epoch seconds.
   *
   * @param {*} value - All-digit string/number (returned as a number
   *   unchanged) or any string `Date.parse` understands.
   * @returns {number|undefined} Epoch seconds, or undefined when the value is
   *   empty or unparseable.
   */
  parseTimestamp(value) {
    const text = this.normalizeOptionalString(value);
    if (!text) return undefined;

    if (/^\d+$/.test(text)) {
      return Number(text);
    }

    const parsed = Date.parse(text);
    if (!Number.isNaN(parsed)) {
      return Math.floor(parsed / 1000);
    }

    return undefined;
  }

  /**
   * Prepares the `before` parameter: all-digit values become numbers, any
   * other non-empty text is forwarded unchanged.
   *
   * @param {*} value - Raw `before` input.
   * @returns {number|string|undefined} Undefined when the value is empty.
   */
  formatBeforeParam(value) {
    const text = this.normalizeOptionalString(value);
    if (!text) return undefined;
    return /^\d+$/.test(text) ? Number(text) : text;
  }

  /**
   * Validates the input and derives everything the paging loop needs.
   *
   * @param {object} input - Actor input.
   * @returns {{subreddit: string, sort: string, pageLimit: number,
   *   sortField: string, params: object}} Search configuration; `sortField`
   *   names the parameter that carries the pagination cursor.
   * @throws {Error} When none of subreddit, author, link_id, parent_id is set.
   */
  buildSearchConfig(input) {
    const subreddit = this.normalizeOptionalString(input.subreddit);
    const hasScope = [subreddit, input.author, input.link_id, input.parent_id]
      .map((value) => this.normalizeOptionalString(value))
      .some(Boolean);
    if (!hasScope) {
      throw new Error('Input must include at least one of: subreddit, author, link_id, parent_id');
    }

    const params = this.buildStaticParams(input);

    return {
      subreddit,
      sort: params.sort,
      // Taken from the request parameters so the "short page" check matches
      // the page size that was actually requested.
      pageLimit: params.limit,
      sortField: params.sort === 'asc' ? 'after' : 'before',
      params,
    };
  }

  /**
   * Builds the query parameters that stay the same for every page.
   *
   * @param {object} input - Actor input.
   * @returns {object} Query parameters for the search endpoint.
   */
  buildStaticParams(input) {
    const params = {
      sort: input.sort === 'asc' ? 'asc' : 'desc',
      limit: this.normalizeLimit(input.limit),
      md2html: true,
      'meta-app': 'search-tool',
    };

    for (const field of SCOPE_FIELDS) {
      const value = this.normalizeOptionalString(input[field]);
      if (value) {
        params[field] = value;
      }
    }

    const after = this.parseTimestamp(input.after);
    if (after !== undefined) {
      params.after = after;
    }

    const before = this.formatBeforeParam(input.before);
    if (before !== undefined) {
      params.before = before;
    }

    const body = this.normalizeOptionalString(input.body);
    if (body) {
      params.body = body;
    }

    return params;
  }

  /**
   * Builds the Referer header value: the site's search page with the current
   * query parameters (minus `md2html` and `meta-app`) in its query string.
   *
   * @param {object} params - Query parameters of the current request.
   * @returns {string} Referer URL.
   */
  buildReferer(params) {
    const refererParams = { fun: 'comments_search' };
    for (const [key, value] of Object.entries(params)) {
      if (key === 'md2html' || key === 'meta-app') continue;
      if (value !== undefined && value !== null && value !== '') {
        refererParams[key] = value;
      }
    }
    const query = new URLSearchParams(refererParams);
    return `${SITE_BASE}/search?${query.toString()}`;
  }

  /**
   * Translates a proxy URL into the `proxy` option passed to axios.
   *
   * @param {string} [proxyUrl] - Proxy URL, optionally with credentials.
   * @returns {object} `{}` without a proxy URL, otherwise `{ proxy: {...} }`.
   */
  getAxiosConfig(proxyUrl) {
    if (!proxyUrl) return {};

    const parsed = new URL(proxyUrl);
    const protocol = parsed.protocol.replace(':', '');
    // URL reports the scheme's default port as '', so fall back explicitly
    // instead of handing axios port 0.
    const defaultPort = protocol === 'https' ? 443 : 80;
    const port = parsed.port ? Number(parsed.port) : defaultPort;

    const proxy = { protocol, host: parsed.hostname, port };
    if (parsed.username) {
      proxy.auth = {
        username: decodeURIComponent(parsed.username),
        password: decodeURIComponent(parsed.password),
      };
    }
    return { proxy };
  }

  /**
   * Shapes one API item into a dataset record.
   *
   * The item's own properties are spread last, so the configured subreddit is
   * only used when the item has no `subreddit` key of its own.
   *
   * @param {object} item - Raw item from the API response.
   * @param {object} searchConfig - Result of `buildSearchConfig`.
   * @returns {object} Item plus `subreddit` fallback and `scrapedAt` ISO time.
   */
  mapResultItem(item, searchConfig) {
    return {
      subreddit: searchConfig.subreddit || item.subreddit,
      ...item,
      scrapedAt: new Date().toISOString(),
    };
  }

  /**
   * Picks the timestamp used as the pagination cursor for the next page.
   *
   * NOTE(review): Reddit-shaped records conventionally expose `created_utc`;
   * `created` is kept as a fallback. Confirm the field name against a live
   * API response.
   *
   * @param {object} [item] - Last item of the current page.
   * @returns {*} Cursor value, or undefined when neither field is present.
   */
  getCursorValue(item) {
    return item?.created_utc ?? item?.created;
  }

  /**
   * Entry point: validates the input, resolves the optional proxy and runs
   * the paginated search.
   *
   * @param {object} input - Actor input (search fields plus optional
   *   `maxItems`, `maxPages`, `proxyConfiguration`).
   * @returns {Promise<void>}
   * @throws {Error} When the input has no search scope.
   */
  async run(input) {
    const { maxItems, maxPages, proxyConfiguration } = input;

    // Destructuring defaults only cover `undefined`; a `null` cap would make
    // the loop condition false and silently scrape nothing.
    this.maxItems = this.normalizeCap(maxItems);
    this.maxPages = this.normalizeCap(maxPages);
    this.searchConfig = this.buildSearchConfig(input);

    const proxyConfig = proxyConfiguration
      ? await Actor.createProxyConfiguration(proxyConfiguration)
      : undefined;
    const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
    const axiosConfig = this.getAxiosConfig(proxyUrl);

    await this.scrapeSearch(axiosConfig);
  }

  /**
   * Fetches result pages until a limit is hit, the API returns an empty or
   * short page, a request fails, or the cursor cannot be advanced.
   *
   * A failed request stops the loop; when nothing has been saved yet, a
   * single error record is pushed instead.
   *
   * @param {object} axiosConfig - Extra axios options (proxy settings).
   * @returns {Promise<void>}
   */
  async scrapeSearch(axiosConfig) {
    const { subreddit, sortField, pageLimit } = this.searchConfig;
    const seenIds = new Set();
    let page = 0;
    let totalSaved = 0;
    let previousCursor;
    let cursorParams = { ...this.searchConfig.params };

    const label = subreddit ? `r/${subreddit}` : 'comments';
    console.log(`Searching ${label}...`);

    while (page < this.maxPages && totalSaved < this.maxItems) {
      page++;
      console.log(`Fetching page ${page} for ${label}...`);

      let responseBody;
      try {
        const response = await axios.get(API_URL, {
          ...axiosConfig,
          params: cursorParams,
          headers: {
            ...DEFAULT_HEADERS,
            Referer: this.buildReferer(cursorParams),
          },
          timeout: 120000,
        });
        responseBody = response.data;
      } catch (error) {
        const message = error.response?.data?.error || error.message;
        console.error(`API request failed on page ${page}:`, message);
        if (totalSaved === 0) {
          await Actor.pushData([
            {
              subreddit: subreddit || null,
              error: message,
              scrapedAt: new Date().toISOString(),
            },
          ]);
        }
        break;
      }

      const items = Array.isArray(responseBody?.data) ? responseBody.data : [];
      if (items.length === 0) {
        console.log(`No more results on page ${page}`);
        break;
      }

      const currentData = [];
      for (const item of items) {
        if (totalSaved >= this.maxItems) break;

        // Items without an id/name cannot be de-duplicated and are kept.
        const dedupeKey = item.id || item.name;
        if (dedupeKey && seenIds.has(dedupeKey)) continue;
        if (dedupeKey) seenIds.add(dedupeKey);

        currentData.push(this.mapResultItem(item, this.searchConfig));
        totalSaved++;
      }

      if (currentData.length > 0) {
        console.log(`Saved ${currentData.length} items from page ${page}`);
        await Actor.pushData(currentData);
      }

      if (totalSaved >= this.maxItems) break;
      if (items.length < pageLimit) {
        console.log('Last page received');
        break;
      }

      const cursorValue = this.getCursorValue(items[items.length - 1]);
      if (!cursorValue) {
        console.log('Missing created timestamp for pagination');
        break;
      }
      // The same cursor twice in a row would request the same page forever.
      if (cursorValue === previousCursor) {
        console.log('Pagination cursor did not advance');
        break;
      }
      previousCursor = cursorValue;

      cursorParams = {
        ...this.searchConfig.params,
        [sortField]: cursorValue,
      };

      await this.randomDelay(1000, 2000);
    }

    console.log(`Finished ${label} with ${totalSaved} comments`);
  }

  /**
   * Waits for a random whole number of milliseconds in [min, max].
   *
   * @param {number} [min=1000] - Lower bound in milliseconds.
   * @param {number} [max=3000] - Upper bound in milliseconds.
   * @returns {Promise<void>}
   */
  async randomDelay(min = 1000, max = 3000) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }
}
272
// Actor entry point. Actor.main() initialises the SDK, runs the callback and
// shuts the Actor down afterwards, so a separate Actor.init() call is not
// needed; awaiting it keeps the returned promise from floating.
await Actor.main(async () => {
  // Sample search, used only when the run was started without any input.
  const fallbackInput = {
    subreddit: 'chatgpt',
    body: 'token',
    sort: 'desc',
    limit: 10,
    maxItems: 50,
  };

  // Prefer the input the run was actually started with.
  const input = (await Actor.getInput()) ?? fallbackInput;

  const scraper = new RedditCommentsSearchScraper();
  await scraper.run(input);
});