1import { Actor } from 'apify';
2import { connect as connectRealBrowser } from 'puppeteer-real-browser';
3import * as cheerio from 'cheerio';
4
5
6
7
/**
 * Static configuration for the GameSpot review scraper.
 * The scalar values are read throughout BaseScraper; `autoScrollPage`
 * drives pagination by clicking the "Next page" link when one exists.
 */
const SCRAPER_CONFIG = {
    // Stop after this many consecutive passes that yield no new items.
    maxNoNewDataCount: 3,
    // Run with a visible browser window.
    headless: false,
    // Let puppeteer-real-browser handle Cloudflare Turnstile challenges.
    turnstile: true,
    // When true, request interception blocks heavy resources (images, fonts, ...).
    filterResources: false,
    // Default cap on scraped items when the Actor input provides none.
    maxItems: 50,
    // Default search URL when the Actor input provides none.
    requestUrl: 'https://www.gamespot.com/search/?i=reviews&q=war',
    // Record field used for de-duplication.
    uniqueIdField: 'reviewUrl',
    // Selector matching one search-result entry.
    waitSelector: 'li.media.clearfix',

    /**
     * Advance to the next results page by clicking the pagination link.
     * Does nothing when no "Next page" link is present.
     *
     * @param {*} page - Puppeteer page instance.
     */
    autoScrollPage: async (page) => {
        const nextPageLink = await page.$('a[title="Next page"]');
        await nextPageLink?.click();
    }
};
37
38
39
40
/**
 * Scrapes GameSpot review search results with a real (anti-bot-resistant)
 * browser, parses each results page with cheerio, de-duplicates records and
 * pushes them to the Apify dataset.
 */
class BaseScraper {

    /**
     * Extract review records from a loaded search-results page.
     *
     * @param {*} $ - cheerio root loaded with the page HTML.
     * @returns {Array<object>} One record per `SCRAPER_CONFIG.waitSelector`
     *   match, with `reviewTitle`, `reviewUrl`, `imageUrl`, `publishedDate`
     *   and `deck` fields; missing elements/attributes yield empty strings.
     */
    parsePageItems($) {
        return $(SCRAPER_CONFIG.waitSelector).map((i, el) => {
            const $item = $(el);
            const titleLink = $item.find('h4.media-title a');
            const imageLink = $item.find('.media-figure a');
            const image = $item.find('.media-img img');
            const dateSpan = $item.find('time.media-date span[itemprop="datePublished"]');
            const deck = $item.find('p.media-deck');

            // Read the href once; only prefix the site origin for relative
            // links so already-absolute URLs are kept untouched.
            const href = imageLink.attr('href') || '';
            const reviewUrl = href
                ? (href.startsWith('http') ? href : 'https://www.gamespot.com' + href)
                : '';

            return {
                reviewTitle: titleLink.text().trim() || '',
                reviewUrl,
                imageUrl: image.attr('src') || '',
                publishedDate: dateSpan.text().trim() || '',
                deck: deck.text().trim() || ''
            };
        }).get();
    }

    /**
     * Entry point: resolves proxy settings, launches the real browser and
     * scrapes the configured search URL until `maxItems` is reached or the
     * results are exhausted.
     *
     * @param {object} input - Actor input.
     * @param {string} input.searchUrl - Search-results URL to scrape.
     * @param {number} [input.maxItems=Infinity] - Cap on scraped items.
     * @param {object} [input.proxyConfiguration] - Apify proxy configuration.
     */
    async run(input) {
        const { searchUrl, maxItems = Infinity, proxyConfiguration } = input;
        this.maxItems = maxItems;
        this.searchUrl = searchUrl;

        const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
        const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
        const proxyOptions = this.buildProxyOptions(proxyUrl);

        const realBrowserOption = {
            args: ["--start-maximized"],
            turnstile: SCRAPER_CONFIG.turnstile,
            headless: SCRAPER_CONFIG.headless,
            customConfig: {},
            connectOption: {
                defaultViewport: null
            },
            // Only attach the proxy key when we actually have proxy options.
            ...(proxyOptions ? { proxy: proxyOptions } : {}),
            plugins: []
        };

        const { page, browser } = await connectRealBrowser(realBrowserOption);

        try {
            // Generous timeouts: the site may present slow bot challenges.
            page.setDefaultNavigationTimeout(300 * 1000);
            page.setDefaultTimeout(3600 * 1000);
            await this.handlePage(page, this.searchUrl);
        } finally {
            // Always release the browser, even when scraping throws.
            await browser?.close();
        }
    }

    /**
     * Scrape one search URL: load it, then repeatedly paginate and harvest
     * items until `maxItems` is reached or several passes in a row add no
     * new data.
     *
     * @param {*} page - Puppeteer page instance.
     * @param {string} url - URL to scrape.
     */
    async handlePage(page, url) {
        console.log(`Processing ${url}...`);
        let count = 0;
        let endOfResults = false;
        let interceptedData = [];
        let lastDataLength = 0;
        let noNewDataCount = 0;

        await page.goto(url);

        if (SCRAPER_CONFIG.waitSelector) {
            await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 60000 }).catch(() => {
                console.log(`data not found...`);
            });
        }

        // Harvest the first page before any pagination happens.
        const firstPageHtml = await page.content();
        await this.processResponseData(firstPageHtml, interceptedData);

        while (!endOfResults && interceptedData.length < this.maxItems) {
            count++;
            console.log(`data fetching...${count}`);
            await this.randomDelay();

            // Click "Next page" (no-op when the link is absent), then give
            // the new page time to render.
            await SCRAPER_CONFIG.autoScrollPage(page);
            await this.randomDelay(3000, 5000);

            // NOTE(review): after clicking "Next page" the previous page's
            // results may still match this selector, so this wait does not
            // guarantee the NEW page has rendered; the no-new-data counter
            // below compensates by retrying up to maxNoNewDataCount times.
            if (SCRAPER_CONFIG.waitSelector) {
                await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 10000 }).catch(() => {
                    console.log(`data not found...`);
                });
            }

            const afterScrollHtml = await page.content();
            await this.processResponseData(afterScrollHtml, interceptedData);

            // Stop once several consecutive passes produce nothing new.
            if (interceptedData.length === lastDataLength) {
                noNewDataCount++;
                if (noNewDataCount >= SCRAPER_CONFIG.maxNoNewDataCount) {
                    endOfResults = true;
                }
            } else {
                noNewDataCount = 0;
                lastDataLength = interceptedData.length;
            }
        }

        console.log(`Scraping completed. Total items: ${interceptedData.length}`);
    }

    /**
     * Install request/response interceptors; when `filterResources` is on,
     * heavy resource types are aborted to speed up navigation.
     *
     * NOTE(review): this method is not currently invoked by run() — call it
     * before page.goto() to activate resource filtering. The response
     * handler is an intentional no-op placeholder.
     *
     * @param {*} page - Puppeteer page instance.
     * @param {Array<object>} interceptedData - Shared accumulator (unused here).
     */
    async setupInterceptors(page, interceptedData) {
        await page.setRequestInterception(true);
        page.on('request', (request) => {
            const resourceType = request.resourceType();
            const blockedResourceTypes = [
                'image',
                'media',
                'font',
                'texttrack',
                'stylesheet',
                'ping'
            ];

            if (SCRAPER_CONFIG.filterResources && blockedResourceTypes.includes(resourceType)) {
                request.abort();
            } else {
                request.continue();
            }
        });
        page.on('response', async (response) => {
            // Placeholder: response bodies are not inspected; items are
            // parsed from page.content() instead.
        });
    }

    /**
     * Parse a page's HTML, drop records already collected (keyed by
     * `SCRAPER_CONFIG.uniqueIdField`), append the rest to `interceptedData`
     * (capped at `this.maxItems`) and push the new records to the dataset.
     *
     * @param {string} htmlContent - Full page HTML.
     * @param {Array<object>} interceptedData - Accumulator of all records so far (mutated).
     */
    async processResponseData(htmlContent, interceptedData) {
        let currentData = [];

        if (!htmlContent) {
            return;
        }

        const $ = cheerio.load(htmlContent);

        const items = this.parsePageItems($);

        if (!items || items.length === 0) {
            return;
        }

        // De-duplicate against everything collected so far.
        const uniqueField = SCRAPER_CONFIG.uniqueIdField;
        const existingIds = new Set(interceptedData.map(item => item[uniqueField]));

        let skippedCount = 0;
        for (const listing of items) {
            if (existingIds.has(listing[uniqueField])) {
                skippedCount++;
                continue;
            }

            const data = {
                searchUrl: this.searchUrl,
                ...listing,
                scrapedAt: new Date().toISOString()
            };

            interceptedData.push(data);
            currentData.push(data);
            existingIds.add(listing[uniqueField]);

            // Respect the global item cap.
            if (interceptedData.length >= this.maxItems) {
                break;
            }
        }

        if (currentData.length > 0) {
            console.log(`Saved ${currentData.length} items, total: ${interceptedData.length}`);
            await Actor.pushData(currentData);
        }
    }

    /**
     * Sleep for a random duration between `min` and `max` milliseconds
     * (inclusive) to make request pacing look less robotic.
     *
     * @param {number} [min=1000] - Minimum delay in ms.
     * @param {number} [max=3000] - Maximum delay in ms.
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await new Promise(resolve => setTimeout(resolve, delay));
    }

    /**
     * Placeholder for injecting session cookies before navigation.
     * Intentionally empty — no cookies are currently required.
     *
     * @param {*} page - Puppeteer page instance.
     */
    async setCookies(page) {
    }

    /**
     * Convert a proxy URL into the host/port/credentials shape expected by
     * puppeteer-real-browser.
     *
     * @param {string|undefined} proxyUrl - e.g. "http://user:pass@host:port".
     * @returns {{host: string, port: number, username?: string, password?: string}|undefined}
     *   undefined when the URL is missing, unparseable, or lacks host/port.
     */
    buildProxyOptions(proxyUrl) {
        if (!proxyUrl) {
            return undefined;
        }

        try {
            const parsed = new URL(proxyUrl);
            if (!parsed.hostname || !parsed.port) {
                return undefined;
            }

            return {
                host: parsed.hostname,
                port: Number(parsed.port),
                // URL percent-encodes credentials (e.g. "@" -> "%40"); decode
                // so the proxy receives the raw username/password.
                username: parsed.username ? decodeURIComponent(parsed.username) : undefined,
                password: parsed.password ? decodeURIComponent(parsed.password) : undefined,
            };
        } catch (error) {
            console.warn(`Invalid proxy URL detected: ${proxyUrl}`, error);
            return undefined;
        }
    }

}
298
299
300
301
302
// Actor.main() performs Actor.init() before the user function and
// Actor.exit() afterwards, so a separate standalone Actor.init() call here
// would initialize the Actor twice — it has been removed.
Actor.main(async () => {
    // Fall back to the built-in defaults when no input is supplied
    // (e.g. during local development runs).
    let input = await Actor.getInput();
    if (!input) {
        input = {
            "searchUrl": SCRAPER_CONFIG.requestUrl,
            "maxItems": SCRAPER_CONFIG.maxItems
        };
    }
    const crawler = new BaseScraper();
    await crawler.run(input);
});