1import { Actor } from 'apify';
2import axios from 'axios';
3
// Root of the search API; getApiUrl() appends the endpoint path to it.
const API_BASE = 'https://arctic-shift.photon-reddit.com/api';

// Value sent as the Referer header (see DEFAULT_HEADERS).
const REFERER = 'https://ihsoyct.github.io/';

// Headers attached to every search request issued by the scraper.
const DEFAULT_HEADERS = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'Referer': REFERER,
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
};

// Input keys forwarded as query parameters only when searching submissions.
const SUBMISSION_FIELDS = ['query', 'title', 'selftext', 'link_flair_text', 'url', 'crosspost_parent_id'];

// Input keys forwarded for every search type; 'after'/'before' are parsed as timestamps.
const COMMON_FIELDS = ['author', 'author_flair_text', 'after', 'before'];
27
/**
 * Searches a single subreddit for submissions or comments through the search
 * API at API_BASE, following timestamp-based pagination and pushing every new
 * item to the Actor dataset.
 */
class RedditSearchScraper {
    /**
     * Converts any value to a trimmed string.
     *
     * @param {*} value - Raw input value.
     * @returns {string} Trimmed string, or '' for null/undefined.
     */
    normalizeOptionalString(value) {
        if (value === undefined || value === null) return '';
        return String(value).trim();
    }

    /**
     * Parses a timestamp filter value.
     *
     * @param {*} value - All-digit string/number (used as-is) or any string
     *   accepted by Date.parse (converted from milliseconds to whole seconds).
     * @returns {number|undefined} Numeric timestamp, or undefined when the
     *   value is empty or unparseable.
     */
    parseTimestamp(value) {
        const text = this.normalizeOptionalString(value);
        if (!text) return undefined;

        if (/^\d+$/.test(text)) {
            return Number(text);
        }

        const parsed = Date.parse(text);
        if (!Number.isNaN(parsed)) {
            return Math.floor(parsed / 1000);
        }

        return undefined;
    }

    /**
     * Parses a tri-state boolean filter.
     *
     * @param {*} value - 'true' / 'false' in any letter case (or a boolean).
     * @returns {boolean|undefined} The boolean, or undefined for anything else.
     */
    parseBooleanParam(value) {
        const text = this.normalizeOptionalString(value).toLowerCase();
        if (text === 'true') return true;
        if (text === 'false') return false;
        return undefined;
    }

    /**
     * Resolves the per-request page size. This is the single source of truth
     * for both the `limit` query parameter and the "last page" detection, so
     * the two can never disagree.
     *
     * @param {*} rawLimit - User-supplied limit.
     * @returns {number} Integer in [1, 100]; 100 when the limit is missing,
     *   non-numeric, or zero.
     */
    resolvePageLimit(rawLimit) {
        const requested = Number(rawLimit);
        // NaN covers undefined/garbage; 0 covers null, '' and an explicit 0.
        if (Number.isNaN(requested) || requested === 0) return 100;
        return Math.trunc(Math.min(Math.max(requested, 1), 100));
    }

    /**
     * Normalizes an optional upper bound (maxItems / maxPages).
     *
     * @param {*} value - User-supplied cap.
     * @returns {number} The numeric cap, or Infinity when the value is
     *   missing (undefined, null, '') or not a number.
     */
    normalizeCap(value) {
        if (value === undefined || value === null || value === '') return Infinity;
        const cap = Number(value);
        return Number.isNaN(cap) ? Infinity : cap;
    }

    /**
     * Validates the Actor input and derives everything the search loop needs.
     *
     * @param {object} input - Actor input.
     * @returns {{searchType: string, subreddit: string, sort: string,
     *   pageLimit: number, sortField: string, params: object}} Search config.
     * @throws {Error} When no subreddit is provided.
     */
    buildSearchConfig(input) {
        const source = input ?? {};
        const searchType = source.searchType === 'comments' ? 'comments' : 'submissions';
        const subreddit = this.normalizeOptionalString(source.subreddit);
        if (!subreddit) {
            throw new Error('Input must include subreddit');
        }

        const sort = source.sort === 'asc' ? 'asc' : 'desc';
        const params = this.buildStaticParams(source, searchType);

        return {
            searchType,
            subreddit,
            sort,
            // Must equal the limit actually sent, otherwise a short (final)
            // page cannot be recognized.
            pageLimit: params.limit,
            // Ascending results page forward with `after`, descending with `before`.
            sortField: sort === 'asc' ? 'after' : 'before',
            params,
        };
    }

    /**
     * Builds the query parameters that stay constant across pages.
     *
     * @param {object} input - Actor input.
     * @param {string} searchType - 'comments' or 'submissions'.
     * @returns {object} Query parameters for the search endpoint.
     */
    buildStaticParams(input, searchType) {
        const source = input ?? {};
        const params = {
            sort: source.sort === 'asc' ? 'asc' : 'desc',
            subreddit: this.normalizeOptionalString(source.subreddit),
            limit: this.resolvePageLimit(source.limit),
        };

        for (const field of COMMON_FIELDS) {
            if (field === 'after' || field === 'before') {
                const timestamp = this.parseTimestamp(source[field]);
                if (timestamp !== undefined) {
                    params[field] = timestamp;
                }
                continue;
            }

            const value = this.normalizeOptionalString(source[field]);
            if (value) {
                params[field] = value;
            }
        }

        // The free-text query is sent as `body` for comments and `query` for submissions.
        const query = this.normalizeOptionalString(source.query);
        if (query) {
            if (searchType === 'comments') {
                params.body = query;
            } else {
                params.query = query;
            }
        }

        if (searchType === 'submissions') {
            for (const field of SUBMISSION_FIELDS) {
                const value = this.normalizeOptionalString(source[field]);
                if (value) {
                    params[field] = value;
                }
            }

            for (const flag of ['over_18', 'spoiler', 'url_exact']) {
                const parsedFlag = this.parseBooleanParam(source[flag]);
                if (parsedFlag !== undefined) {
                    params[flag] = parsedFlag;
                }
            }
        }

        return params;
    }

    /**
     * @param {string} searchType - 'comments' or anything else (submissions).
     * @returns {string} Full URL of the matching search endpoint.
     */
    getApiUrl(searchType) {
        return searchType === 'comments'
            ? `${API_BASE}/comments/search`
            : `${API_BASE}/posts/search`;
    }

    /**
     * Translates a proxy URL into an axios `proxy` request option.
     *
     * @param {string} [proxyUrl] - e.g. 'http://user:pass@host:8000'.
     * @returns {object} `{ proxy: {...} }`, or `{}` when no proxy URL is given.
     */
    getAxiosConfig(proxyUrl) {
        if (!proxyUrl) return {};

        const parsed = new URL(proxyUrl);
        const protocol = parsed.protocol.replace(':', '');
        // URL#port is '' when the port is absent or is the scheme default;
        // Number('') would yield the invalid port 0.
        const port = parsed.port ? Number(parsed.port) : protocol === 'https' ? 443 : 80;

        const proxy = { protocol, host: parsed.hostname, port };
        if (parsed.username) {
            proxy.auth = {
                username: decodeURIComponent(parsed.username),
                password: decodeURIComponent(parsed.password),
            };
        }

        // NOTE(review): confirm that the installed axios version tunnels HTTPS
        // targets correctly through an HTTP proxy with this option.
        return { proxy };
    }

    /**
     * Decorates an API item with search metadata and a scrape timestamp.
     * Fields on the item itself override the searchType/subreddit defaults.
     *
     * @param {object} item - Raw item from the API response.
     * @param {object} searchConfig - Result of buildSearchConfig().
     * @returns {object} Dataset row.
     */
    mapResultItem(item, searchConfig) {
        return {
            searchType: searchConfig.searchType,
            subreddit: searchConfig.subreddit,
            ...item,
            scrapedAt: new Date().toISOString(),
        };
    }

    /**
     * Entry point: validates input, resolves the optional proxy and runs the search.
     *
     * @param {object} input - Actor input (maxItems, maxPages,
     *   proxyConfiguration plus the search options).
     * @returns {Promise<void>}
     * @throws {Error} When the input has no subreddit.
     */
    async run(input) {
        // A missing input must surface as the validation error below rather
        // than a destructuring TypeError.
        const source = input ?? {};
        const { maxItems, maxPages, proxyConfiguration } = source;

        // Destructuring defaults do not apply to null, so normalize explicitly.
        this.maxItems = this.normalizeCap(maxItems);
        this.maxPages = this.normalizeCap(maxPages);
        this.searchConfig = this.buildSearchConfig(source);

        const proxyConfig = proxyConfiguration
            ? await Actor.createProxyConfiguration(proxyConfiguration)
            : undefined;
        const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
        const axiosConfig = this.getAxiosConfig(proxyUrl);

        await this.scrapeSearch(axiosConfig);
    }

    /**
     * Fetches result pages until a limit is hit, the API runs dry, or the
     * cursor can no longer advance, pushing new items to the dataset.
     * If the very first request fails, a single error row is pushed instead.
     *
     * @param {object} axiosConfig - Extra axios options (e.g. proxy).
     * @returns {Promise<void>}
     */
    async scrapeSearch(axiosConfig) {
        const { searchType, subreddit, sortField, pageLimit, params } = this.searchConfig;
        const apiUrl = this.getApiUrl(searchType);
        const seenIds = new Set();
        let page = 0;
        let totalSaved = 0;
        let cursorParams = { ...params };

        console.log(`Searching r/${subreddit} (${searchType})...`);

        while (page < this.maxPages && totalSaved < this.maxItems) {
            page++;
            console.log(`Fetching page ${page} for r/${subreddit}...`);

            let responseBody;
            try {
                const response = await axios.get(apiUrl, {
                    ...axiosConfig,
                    params: cursorParams,
                    headers: DEFAULT_HEADERS,
                    timeout: 120000,
                });
                responseBody = response.data;
            } catch (error) {
                const message = error.response?.data?.error || error.message;
                console.error(`API request failed on page ${page}:`, message);
                // Only record the failure when nothing was saved, so the
                // dataset is never left empty without explanation.
                if (totalSaved === 0) {
                    await Actor.pushData([
                        {
                            searchType,
                            subreddit,
                            error: message,
                            scrapedAt: new Date().toISOString(),
                        },
                    ]);
                }
                break;
            }

            const items = Array.isArray(responseBody?.data) ? responseBody.data : [];
            if (items.length === 0) {
                console.log(`No more results on page ${page}`);
                break;
            }

            const currentData = [];
            for (const item of items) {
                if (totalSaved >= this.maxItems) break;

                const dedupeKey = item.id || item.name;
                if (dedupeKey) {
                    if (seenIds.has(dedupeKey)) continue;
                    seenIds.add(dedupeKey);
                }

                currentData.push(this.mapResultItem(item, this.searchConfig));
                totalSaved++;
            }

            if (currentData.length === 0) {
                // Every item was already seen, so the cursor is not advancing;
                // continuing would refetch the same page indefinitely.
                console.log('No new items on this page; stopping pagination');
                break;
            }

            console.log(`Saved ${currentData.length} items from page ${page}`);
            await Actor.pushData(currentData);

            if (totalSaved >= this.maxItems) break;
            if (items.length < pageLimit) {
                console.log('Last page received');
                break;
            }

            // Prefer `created_utc` and fall back to `created` so pagination
            // works whichever timestamp field the item carries.
            const lastItem = items[items.length - 1];
            const cursorValue = lastItem?.created_utc ?? lastItem?.created;
            if (!cursorValue) {
                console.log('Missing created timestamp for pagination');
                break;
            }

            cursorParams = {
                ...params,
                [sortField]: cursorValue,
            };

            await this.randomDelay(1000, 2000);
        }

        console.log(`Finished r/${subreddit} with ${totalSaved} items`);
    }

    /**
     * Sleeps for a random whole number of milliseconds in [min, max].
     *
     * @param {number} [min=1000] - Lower bound in ms.
     * @param {number} [max=3000] - Upper bound in ms.
     * @returns {Promise<void>}
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await new Promise((resolve) => setTimeout(resolve, delay));
    }
}
278
// Actor.main() is used on its own here: per the Apify SDK documentation it
// initializes the Actor, runs the callback and exits the process, so a
// separate Actor.init() call beforehand is not needed.
await Actor.main(async () => {
    // Fall back to an empty object when no input is available, so the scraper
    // reports its own "Input must include subreddit" validation error.
    const input = (await Actor.getInput()) ?? {};

    const scraper = new RedditSearchScraper();
    await scraper.run(input);
});