1import { Actor } from 'apify';
2import axios from 'axios';
3import * as cheerio from 'cheerio';
4
// Endpoint of the stream filter page that every search request is sent to.
const PAGE_URL = 'https://gearbox.crunchprank.net/twitch/filter.php';

// Browser-like request headers attached to each page fetch. Key order is kept
// as-is because it determines the insertion order of the headers object.
const DEFAULT_HEADERS = {
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
  // Both cache headers ask intermediaries for a fresh copy of the page.
  'cache-control': 'no-cache',
  'pragma': 'no-cache',
  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
};
15
/**
 * Scrapes stream listings from the filter page at PAGE_URL, page by page,
 * and pushes the de-duplicated results to the Apify dataset.
 */
class TwitchStreamsSearchScraper {
  /**
   * Converts an optional input value to a trimmed string.
   *
   * @param {*} value - Any input value.
   * @returns {string} The trimmed string form, or '' for null/undefined.
   */
  normalizeOptionalString(value) {
    if (value === undefined || value === null) return '';
    return String(value).trim();
  }

  /**
   * Parses an optional numeric input.
   *
   * @param {*} value - Number, numeric string, or nullish/blank value.
   * @returns {number|undefined} A finite number, or undefined when the value
   *   is missing, blank, or not a finite number.
   */
  parseOptionalNumber(value) {
    const text = this.normalizeOptionalString(value);
    if (!text) return undefined;
    const num = Number(text);
    return Number.isFinite(num) ? num : undefined;
  }

  /**
   * Builds the query-string parameters for one request to the filter page.
   * Only filters that are actually set are included; `sortOrder` is always
   * sent and defaults to 'viewCountDesc'.
   *
   * @param {object|null|undefined} input - Actor input with the filter fields.
   * @param {number} [page=1] - 1-based page number; omitted from the query for page 1.
   * @returns {object} Plain object of query parameters.
   */
  buildQueryParams(input, page = 1) {
    // Tolerate a missing input object so the method never throws on null.
    const source = input ?? {};
    const params = {};

    const search = this.normalizeOptionalString(source.search);
    if (search) params.search = search;

    const username = this.normalizeOptionalString(source.username);
    if (username) params.username = username;

    if (source.useUsernameRegex) {
      params.useUsernameRegex = 'on';
    }

    const games = this.normalizeOptionalString(source.games);
    if (games) params.games = games;

    if (source.useRegex) {
      params.useRegex = 'on';
    }

    const tags = this.normalizeOptionalString(source.tags);
    if (tags) params.tags = tags;

    if (source.includeAllTags) {
      params.includeAllTags = '1';
    }

    const excludeTags = this.normalizeOptionalString(source.excludeTags);
    if (excludeTags) params.excludeTags = excludeTags;

    const broadcasterType = this.normalizeOptionalString(source.broadcasterType);
    if (broadcasterType) params.broadcasterType = broadcasterType;

    const language = this.normalizeOptionalString(source.language);
    if (language) params.language = language;

    const sortOrder = this.normalizeOptionalString(source.sortOrder) || 'viewCountDesc';
    params.sortOrder = sortOrder;

    // Viewer bounds are only sent when they are positive numbers.
    const minViewers = this.parseOptionalNumber(source.minViewers);
    if (minViewers !== undefined && minViewers > 0) params.minViewers = minViewers;

    const maxViewers = this.parseOptionalNumber(source.maxViewers);
    if (maxViewers !== undefined && maxViewers > 0) params.maxViewers = maxViewers;

    if (page > 1) {
      params.page = page;
    }

    return params;
  }

  /**
   * Translates a proxy URL into an axios `proxy` request option.
   *
   * @param {string|undefined} proxyUrl - e.g. 'http://user:pass@host:8000'.
   * @returns {object} `{ proxy: {...} }`, or `{}` when no proxy URL is given.
   * @throws {TypeError} If `proxyUrl` is not a parseable URL.
   */
  getAxiosConfig(proxyUrl) {
    if (!proxyUrl) return {};

    const parsed = new URL(proxyUrl);
    const protocol = parsed.protocol.replace(':', '');

    // URL#port is '' when the URL uses the scheme's default port, and
    // Number('') would yield 0, so fall back to the scheme default explicitly.
    let port;
    if (parsed.port) {
      port = Number(parsed.port);
    } else {
      port = protocol === 'https' ? 443 : 80;
    }

    const proxy = { protocol, host: parsed.hostname, port };

    if (parsed.username) {
      // Credentials are percent-encoded inside a URL; axios expects them raw.
      proxy.auth = {
        username: decodeURIComponent(parsed.username),
        password: decodeURIComponent(parsed.password),
      };
    }

    return { proxy };
  }

  /**
   * Decodes numeric character references and the basic named HTML entities
   * (amp, lt, gt, quot, apos, nbsp). Unknown entities are left untouched.
   *
   * @param {string} text - Text taken from serialized HTML.
   * @returns {string} The text with recognized entities decoded.
   */
  decodeHtmlEntities(text) {
    const named = new Map([
      ['amp', '&'],
      ['lt', '<'],
      ['gt', '>'],
      ['quot', '"'],
      ['apos', "'"],
      ['nbsp', '\u00a0'],
    ]);

    // A single pass avoids double-decoding input such as '&amp;lt;'.
    return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity, body) => {
      if (body[0] !== '#') {
        return named.get(body) ?? entity;
      }
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // Guard the range so String.fromCodePoint cannot throw a RangeError.
      const isValid = codePoint > 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : entity;
    });
  }

  /**
   * Extracts the text that follows `<strong>Label</strong>:` in a card's HTML.
   *
   * @param {string} contentHtml - Inner HTML of the card content element.
   * @param {string} label - Field label, matched case-insensitively.
   * @returns {string|null} Entity-decoded, trimmed value, or null when the
   *   label is absent or its value is empty.
   */
  extractField(contentHtml, label) {
    // Escape the label so regex metacharacters in it are matched literally.
    const safeLabel = String(label).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`<strong>${safeLabel}</strong>:\\s*([^<]+)`, 'i');
    const match = String(contentHtml ?? '').match(pattern);
    if (!match) return null;

    // The value comes from serialized HTML, so '&' arrives as '&amp;' etc.
    const value = this.decodeHtmlEntities(match[1]).trim();
    return value || null;
  }

  /**
   * Parses one result page.
   *
   * @param {string} html - Raw HTML of the filter page.
   * @returns {{streams: object[], totalStreamsFound: number|null, hasNextPage: boolean}}
   *   The stream cards found on the page, the "Total Streams Found" counter
   *   (null when not present), and whether a link labelled "next" exists.
   */
  parseStreamsFromHtml(html) {
    // Coerce defensively: a non-string body must not crash the parser.
    const $ = cheerio.load(typeof html === 'string' ? html : String(html ?? ''));
    const totalText = $('p')
      .filter((_, el) => $(el).text().includes('Total Streams Found'))
      .first()
      .text()
      .trim();
    const totalMatch = totalText.match(/Total Streams Found:\s*([\d,]+)/i);
    const totalStreamsFound = totalMatch
      ? Number(totalMatch[1].replace(/,/g, ''))
      : null;

    const streams = [];

    $('div.card.h-100').each((_, card) => {
      const titleLink = $(card).find('.card-title a').first();
      const title = titleLink.text().trim();
      const streamUrl = titleLink.attr('href') || null;
      const thumbnail = $(card).find('img.card-img-top').attr('src') || null;
      const profileImage = $(card).find('img.rounded-circle').attr('src') || null;
      const contentHtml = $(card).find('.card-content').html() || '';
      const startedRaw = $(card).find('.card-footer').text().replace(/\s+/g, ' ').trim();
      const started = startedRaw.replace(/^Started:\s*/i, '').trim() || null;

      const username = this.extractField(contentHtml, 'Username');
      const tagsText = this.extractField(contentHtml, 'Tags');
      const viewersText = this.extractField(contentHtml, 'Viewers');
      // Viewer counts may contain thousands separators, e.g. "1,234".
      const viewers = viewersText ? Number(viewersText.replace(/,/g, '')) : null;

      streams.push({
        title: title || null,
        streamUrl,
        thumbnail,
        profileImage,
        game: this.extractField(contentHtml, 'Game'),
        username,
        language: this.extractField(contentHtml, 'Language'),
        broadcasterType: this.extractField(contentHtml, 'Broadcaster Type'),
        viewers: Number.isFinite(viewers) ? viewers : null,
        tagsText,
        tags: tagsText
          ? tagsText.split(',').map((tag) => tag.trim()).filter(Boolean)
          : [],
        started,
      });
    });

    const hasNextPage = $('a')
      .toArray()
      .some((el) => $(el).text().trim().toLowerCase() === 'next');

    return { streams, totalStreamsFound, hasNextPage };
  }

  /**
   * Turns a parsed stream into the dataset item shape.
   *
   * @param {object} stream - Stream object produced by parseStreamsFromHtml.
   * @param {number} rank - 1-based position of the item in the saved output.
   * @param {number|null} totalStreamsFound - Total reported by the page.
   * @returns {object} The stream with rank, total, profileUrl and scrapedAt added.
   */
  mapStreamItem(stream, rank, totalStreamsFound) {
    return {
      rank,
      totalStreamsFound,
      ...stream,
      profileUrl: stream.username ? `https://www.twitch.tv/${stream.username}` : stream.streamUrl,
      scrapedAt: new Date().toISOString(),
    };
  }

  /**
   * Entry point: stores the input, resolves the optional proxy and scrapes.
   *
   * @param {object|null|undefined} input - Actor input. `maxItems` caps the
   *   number of saved streams (missing or non-numeric means no limit);
   *   `proxyConfiguration` is passed to Actor.createProxyConfiguration.
   * @returns {Promise<void>}
   */
  async run(input) {
    // Actor.getInput() can yield null; treat that as "no options".
    const safeInput = input ?? {};
    const { maxItems, proxyConfiguration } = safeInput;

    // null / NaN / non-numeric limits would make `totalSaved < maxItems`
    // false on the first check and silently produce no output.
    const limit = this.parseOptionalNumber(maxItems);
    this.maxItems = limit === undefined ? Infinity : limit;
    this.input = safeInput;

    const proxyConfig = proxyConfiguration
      ? await Actor.createProxyConfiguration(proxyConfiguration)
      : undefined;
    const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
    const axiosConfig = this.getAxiosConfig(proxyUrl);

    await this.scrapeStreams(axiosConfig);
  }

  /**
   * Fetches result pages until `this.maxItems` streams are saved, the last
   * page is reached, a request fails, or a page yields nothing new. When
   * nothing has been saved yet, a failure or empty result is recorded as a
   * single error item in the dataset.
   *
   * @param {object} axiosConfig - Extra axios options (proxy settings).
   * @returns {Promise<void>}
   */
  async scrapeStreams(axiosConfig) {
    const seenUsernames = new Set();
    let page = 1;
    let totalSaved = 0;
    let totalStreamsFound = null;

    console.log('Searching Twitch streams...');

    while (totalSaved < this.maxItems) {
      const params = this.buildQueryParams(this.input, page);
      console.log(`Fetching page ${page}...`);

      let html;
      try {
        const response = await axios.get(PAGE_URL, {
          ...axiosConfig,
          params,
          headers: DEFAULT_HEADERS,
          timeout: 120000,
        });
        html = response.data;
      } catch (error) {
        const message = error.response?.data?.message || error.message;
        console.error(`Failed to fetch page ${page}:`, message);
        if (totalSaved === 0) {
          await Actor.pushData([
            {
              error: message,
              scrapedAt: new Date().toISOString(),
            },
          ]);
        }
        break;
      }

      const parsed = this.parseStreamsFromHtml(html);
      // Only the first page's total is kept so every item reports the same value.
      if (totalStreamsFound === null) {
        totalStreamsFound = parsed.totalStreamsFound;
        console.log(`Total streams found: ${totalStreamsFound ?? 'unknown'}`);
      }

      if (parsed.streams.length === 0) {
        console.log(`No streams on page ${page}`);
        if (totalSaved === 0) {
          await Actor.pushData([
            {
              error: 'No streams found',
              totalStreamsFound,
              scrapedAt: new Date().toISOString(),
            },
          ]);
        }
        break;
      }

      const currentData = [];
      for (const stream of parsed.streams) {
        if (totalSaved >= this.maxItems) break;

        // Streams can shift between pages while paginating, so de-duplicate
        // by username (or stream URL). Items with no key are always kept.
        const dedupeKey = stream.username || stream.streamUrl;
        if (dedupeKey && seenUsernames.has(dedupeKey)) continue;
        if (dedupeKey) seenUsernames.add(dedupeKey);

        totalSaved++;
        currentData.push(this.mapStreamItem(stream, totalSaved, totalStreamsFound));
      }

      if (currentData.length > 0) {
        console.log(`Saved ${currentData.length} streams from page ${page}`);
        await Actor.pushData(currentData);
      }

      if (totalSaved >= this.maxItems) break;
      if (!parsed.hasNextPage) {
        console.log('Last page received');
        break;
      }
      // A page made up entirely of already-seen streams means pagination is
      // not advancing; stop instead of requesting pages forever.
      if (currentData.length === 0) {
        console.log(`No new streams on page ${page}, stopping`);
        break;
      }

      page++;
      await this.randomDelay(800, 1500);
    }

    console.log(`Finished with ${totalSaved} streams`);
  }

  /**
   * Waits for a random whole number of milliseconds in [min, max].
   *
   * @param {number} [min=800] - Lower bound in milliseconds.
   * @param {number} [max=1500] - Upper bound in milliseconds.
   * @returns {Promise<void>}
   */
  async randomDelay(min = 800, max = 1500) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }
}
277
// Actor.main() performs SDK initialization and shutdown around the callback,
// so a separate Actor.init() call is redundant. Awaiting the returned promise
// keeps it from floating at module top level.
await Actor.main(async () => {
  // getInput() resolves to null when the run has no input; use empty options.
  const input = (await Actor.getInput()) ?? {};

  const scraper = new TwitchStreamsSearchScraper();
  await scraper.run(input);
});