1import { Actor } from 'apify';
2import axios from 'axios';
3import * as cheerio from 'cheerio';
4
// Page fetched by the scraper; rows of its tag table are parsed into
// { tag, count } records.
const PAGE_URL = 'https://gearbox.crunchprank.net/twitch/tags.php';
6
// Request headers sent with the page fetch. They imitate a desktop Chrome
// browser, and `cache-control` / `pragma` ask for an uncached response.
const DEFAULT_HEADERS = {
    accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'cache-control': 'no-cache',
    pragma: 'no-cache',
    'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36',
};
15
/**
 * Fetches the tag table at PAGE_URL, filters and ranks its rows, and pushes
 * the result to the default Apify dataset.
 */
class TwitchTagsScraper {
    /**
     * Coerces an optional value to a trimmed string.
     *
     * @param {*} value - Any value; null/undefined are treated as "not set".
     * @returns {string} '' for null/undefined, otherwise `String(value).trim()`.
     */
    normalizeOptionalString(value) {
        if (value === undefined || value === null) return '';
        return String(value).trim();
    }

    /**
     * Translates a proxy URL into the `proxy` section of an axios request config.
     *
     * @param {string} [proxyUrl] - Proxy URL, optionally with credentials.
     * @returns {object} `{}` when no proxy URL is given, otherwise `{ proxy: {...} }`.
     * @throws {TypeError} When `proxyUrl` is not a parseable URL (from `new URL`).
     */
    getAxiosConfig(proxyUrl) {
        if (!proxyUrl) return {};

        const parsed = new URL(proxyUrl);
        const protocol = parsed.protocol.replace(':', '');

        // URL#port is '' both when the port is omitted and when it equals the
        // scheme default (e.g. "http://host:80"), and Number('') would be 0.
        // Fall back to the scheme's well-known port instead.
        let port = Number(parsed.port);
        if (parsed.port === '') {
            port = protocol === 'https' ? 443 : 80;
        }

        // NOTE(review): this relies on axios' built-in `proxy` option; confirm it
        // tunnels HTTPS targets correctly for the axios version in use.
        const proxy = { protocol, host: parsed.hostname, port };
        if (parsed.username) {
            // Credentials in a URL are percent-encoded; axios expects them raw.
            proxy.auth = {
                username: decodeURIComponent(parsed.username),
                password: decodeURIComponent(parsed.password),
            };
        }

        return { proxy };
    }

    /**
     * Reads the "last updated" note from the page.
     *
     * @param {Function} $ - Cheerio root returned by `cheerio.load`.
     * @returns {string|null} The `YYYY-MM-DD HH:MM:SS` timestamp found in the
     *   first `small.fst-italic` element, the element's raw text when no
     *   timestamp matches, or null when the text is empty.
     */
    parseLastUpdated($) {
        const text = $('small.fst-italic').first().text().trim();
        const match = text.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
        return match ? match[1] : text || null;
    }

    /**
     * Parses the text of a count cell into a number.
     *
     * @param {*} text - Raw cell text, possibly with thousands separators.
     * @returns {number|null} The numeric value, or null when the cell is empty
     *   or not numeric.
     */
    parseCount(text) {
        const cleaned = String(text ?? '').replace(/,/g, '').trim();
        // Number('') is 0, which would report a missing count as a real zero.
        if (cleaned === '') return null;

        const value = Number(cleaned);
        return Number.isFinite(value) ? value : null;
    }

    /**
     * Extracts tag rows and the last-updated note from the page HTML.
     *
     * @param {string} html - HTML of the tags page.
     * @returns {{ tags: Array<{ tag: string, count: number|null }>, lastUpdated: string|null }}
     */
    parseTagsFromHtml(html) {
        const $ = cheerio.load(html);
        const lastUpdated = this.parseLastUpdated($);
        const tags = [];

        $('table.table.table-bordered.table-striped tbody#myTable tr').each((_, row) => {
            const cells = $(row).find('td');
            // Rows need at least a tag cell and a count cell.
            if (cells.length < 2) return;

            const tag = $(cells[0]).text().trim();
            if (!tag) return;

            tags.push({ tag, count: this.parseCount($(cells[1]).text()) });
        });

        return { tags, lastUpdated };
    }

    /**
     * Applies the optional `tagQuery` and `minCount` filters.
     *
     * @param {Array<{ tag: string, count: number|null }>} tags - Parsed rows.
     * @param {object|null} [input] - Actor input; null/undefined means no filters.
     * @returns {Array} Rows whose tag contains `tagQuery` (case-insensitive) and
     *   whose count is at least `minCount`. A null count is compared as 0.
     */
    filterTags(tags, input) {
        const { tagQuery, minCount: rawMinCount } = input ?? {};
        const query = this.normalizeOptionalString(tagQuery).toLowerCase();
        const minCount = Number(rawMinCount);
        // Only a finite, positive threshold filters anything.
        const hasMinCount = Number.isFinite(minCount) && minCount > 0;

        return tags.filter(({ tag, count }) => {
            if (query && !tag.toLowerCase().includes(query)) return false;
            if (hasMinCount && (count ?? 0) < minCount) return false;
            return true;
        });
    }

    /**
     * Builds the dataset record for one tag row.
     *
     * @param {{ tag: string, count: number|null }} tag - Parsed row.
     * @param {number} index - Zero-based position in the filtered list.
     * @param {string|null} lastUpdated - Value from `parseLastUpdated`.
     * @param {string} [scrapedAt] - ISO timestamp; defaults to the current time.
     * @returns {{ rank: number, tag: string, count: number|null, lastUpdated: string|null, scrapedAt: string }}
     */
    mapTagItem(tag, index, lastUpdated, scrapedAt = new Date().toISOString()) {
        return {
            rank: index + 1,
            tag: tag.tag,
            count: tag.count,
            lastUpdated,
            scrapedAt,
        };
    }

    /**
     * Turns the user-supplied `maxItems` into a safe `slice` end index.
     *
     * @param {*} maxItems - Requested cap; numbers and numeric strings are honoured.
     * @param {number} available - Number of rows that could be returned.
     * @returns {number} A whole number in `[0, available]`. Missing, non-numeric,
     *   infinite or negative values mean "no limit".
     */
    resolveLimit(maxItems, available) {
        const candidate =
            typeof maxItems === 'string' && maxItems.trim() !== '' ? Number(maxItems) : maxItems;

        // A negative value passed to slice() would drop rows from the end.
        if (typeof candidate !== 'number' || !Number.isFinite(candidate) || candidate < 0) {
            return available;
        }

        return Math.min(Math.floor(candidate), available);
    }

    /**
     * Entry point: stores the input, resolves an optional proxy and scrapes.
     *
     * @param {object|null} input - Actor input (`maxItems`, `proxyConfiguration`,
     *   `tagQuery`, `minCount`). Null/undefined is treated as an empty object.
     * @returns {Promise<void>}
     */
    async run(input) {
        // Destructuring null would throw, so fall back to an empty input.
        const safeInput = input ?? {};
        const { maxItems = Infinity, proxyConfiguration } = safeInput;

        this.maxItems = maxItems;
        this.input = safeInput;

        const proxyConfig = proxyConfiguration
            ? await Actor.createProxyConfiguration(proxyConfiguration)
            : undefined;
        const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
        const axiosConfig = this.getAxiosConfig(proxyUrl);

        await this.scrapeTags(axiosConfig);
    }

    /**
     * Downloads the page, parses and filters the tags, and pushes the results.
     * A fetch failure or an empty result is reported as a single error record
     * in the dataset rather than by throwing.
     *
     * @param {object} axiosConfig - Extra axios options from `getAxiosConfig`.
     * @returns {Promise<void>}
     */
    async scrapeTags(axiosConfig) {
        console.log('Fetching Twitch tags page...');

        let html;
        try {
            const response = await axios.get(PAGE_URL, {
                ...axiosConfig,
                headers: DEFAULT_HEADERS,
                timeout: 120000,
            });
            html = response.data;
        } catch (error) {
            const message = error.response?.data?.message || error.message;
            console.error('Failed to fetch tags page:', message);
            await Actor.pushData([
                {
                    error: message,
                    scrapedAt: new Date().toISOString(),
                },
            ]);
            return;
        }

        const { tags, lastUpdated } = this.parseTagsFromHtml(html);
        const filteredTags = this.filterTags(tags, this.input);

        if (filteredTags.length === 0) {
            console.log('No tags matched the filters');
            await Actor.pushData([
                {
                    error: 'No tags matched the filters',
                    lastUpdated,
                    scrapedAt: new Date().toISOString(),
                },
            ]);
            return;
        }

        const limit = this.resolveLimit(this.maxItems, filteredTags.length);
        // One timestamp for the whole batch so every row of a run agrees.
        const scrapedAt = new Date().toISOString();
        const results = filteredTags
            .slice(0, limit)
            .map((item, index) => this.mapTagItem(item, index, lastUpdated, scrapedAt));

        await Actor.pushData(results);
        // Log only once the push has succeeded.
        console.log(`Saved ${results.length} tags (from ${tags.length} total)`);
    }
}
162
// Actor.main() initialises the SDK, runs the callback and then exits the
// process, so no separate Actor.init() call is needed. Awaiting it keeps the
// returned promise from floating.
await Actor.main(async () => {
    // Actor.getInput() resolves to null when the run has no input.
    const input = (await Actor.getInput()) ?? {};

    const scraper = new TwitchTagsScraper();
    await scraper.run(input);
});