1import { Actor } from 'apify';
2import { connect as connectRealBrowser } from 'puppeteer-real-browser';
3import * as cheerio from 'cheerio';
4
5
6
7
/**
 * Static settings for the scraper: browser launch flags, the default listing
 * URL, the selector that identifies a listing card, and the paging helper.
 */
const SCRAPER_CONFIG = {
  // Number of consecutive paging rounds without new items before giving up.
  maxNoNewDataCount: 3,

  // Browser launch switches handed to puppeteer-real-browser.
  headless: false,
  turnstile: true,

  // When true, the request interceptor aborts heavy resource types.
  filterResources: false,

  // Defaults used when the Actor is started without any input.
  maxItems: 50,
  requestUrl: 'https://www.productreview.com.au/c/air-fryers?brands=ninja%2Cphilips-domestic-appliances%2Csunbeam%2Ctefal',

  // Name of the field used to de-duplicate parsed listings.
  uniqueIdField: 'uniqueId',

  // Selector matching one listing card; also awaited after each page change.
  waitSelector: '[data-ctx-namespace][data-ctx-influx]',

  /**
   * Clicks the "load more results" button inside the page, if it is present.
   *
   * @param {object} page - Browser page exposing `evaluate`.
   * @returns {Promise<void>}
   */
  async autoScrollPage(page) {
    await page.evaluate(() => {
      // The selector must live inside this callback: it runs in the page context.
      const loadMoreButton = document.querySelector(
        'button[data-track-ga-click="{·category·:·load_more_results·}"]',
      );
      loadMoreButton?.click();
    });
  },
};
39
40
41
42
/**
 * Listing scraper: opens the search URL in a real browser, repeatedly triggers
 * the configured paging action, parses the rendered HTML with cheerio and
 * pushes every previously unseen listing to the Actor dataset.
 */
class BaseScraper {
  /** Resource types aborted by the request interceptor when filtering is on. */
  static #BLOCKED_RESOURCE_TYPES = new Set([
    'image',
    'media',
    'font',
    'texttrack',
    'stylesheet',
    'ping',
  ]);

  /** Ports implied by a proxy URL scheme when the URL carries no explicit port. */
  static #DEFAULT_PROXY_PORTS = new Map([
    ['http:', 80],
    ['https:', 443],
  ]);

  /**
   * Parses a JSON-like context attribute whose quotes are stored as `·`.
   * Best effort: anything missing or unparsable yields an empty object.
   *
   * @param {string|undefined} rawValue - Raw attribute value.
   * @returns {object} Parsed object, or `{}`.
   */
  static #parseContextAttr(rawValue) {
    try {
      const parsed = JSON.parse((rawValue || '').replace(/·/g, '"'));
      return parsed !== null && typeof parsed === 'object' ? parsed : {};
    } catch {
      return {};
    }
  }

  /**
   * Percent-decodes a URL credential component. `URL#username` / `URL#password`
   * are exposed in their encoded form, while the browser needs the raw value.
   *
   * @param {string} value - Encoded component (may be empty).
   * @returns {string|undefined} Decoded value, the input unchanged if it is not
   *   valid percent-encoding, or `undefined` when empty.
   */
  static #decodeCredential(value) {
    if (!value) {
      return undefined;
    }
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  }

  /**
   * Waits for the configured listing selector; a timeout is logged, not thrown.
   *
   * @param {object} page - Browser page.
   * @param {number} timeout - Maximum wait in milliseconds.
   * @returns {Promise<void>}
   */
  static async #waitForListings(page, timeout) {
    if (!SCRAPER_CONFIG.waitSelector) {
      return;
    }
    await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout }).catch(() => {
      console.log(`data not found...`);
    });
  }

  /**
   * Extracts one record per listing card from a loaded cheerio document.
   * Cards without a `targetListingId` in `data-ctx-influx` are omitted.
   *
   * @param {Function} $ - Cheerio root returned by `cheerio.load`.
   * @returns {object[]} Parsed listing records.
   */
  parsePageItems($) {
    return $(SCRAPER_CONFIG.waitSelector)
      .map((index, el) => {
        const $item = $(el);

        const influx = BaseScraper.#parseContextAttr($item.attr('data-ctx-influx'));
        const uniqueId = influx.targetListingId || '';
        if (uniqueId === '') {
          // Cheerio's map() drops entries for which the callback returns undefined.
          return undefined;
        }

        const ga = BaseScraper.#parseContextAttr($item.attr('data-ctx-ga'));
        const brand = ga.dimensions?.targetBrand || '';

        const $titleLink = $item.find('a.dxgoRW').first();
        const listingUrl = $titleLink.attr('href') || '';

        const $image = $item.find('img#listing-header-main-picture').first();

        // The rating number is the span's own text; child elements are stripped first.
        const $rating = $item.find('span.Ydzmmx').first();
        const ratingText = $rating.clone().children().remove().end().text().trim();
        const ratingCount = $rating.attr('data-count') || '0';

        // Spec rows: the label span's parent holds "<label>: <value>"; the
        // label (and a directly following colon) is removed from its text.
        const priceText = $item
          .find('span.MvDLkz:contains("Price")')
          .parent()
          .text()
          .replace(/Price.*?:?/g, '')
          .trim();
        const capacityText = $item
          .find('span.MvDLkz:contains("Capacity")')
          .parent()
          .text()
          .replace(/Capacity.*?:?/g, '')
          .trim();

        return {
          uniqueId,
          productTitle: $titleLink.text().trim() || '',
          productUrl: listingUrl ? `https://www.productreview.com.au${listingUrl}` : '',
          imageUrl: $image.attr('src') || '',
          imageAlt: $image.attr('alt') || '',
          brand,
          ratingValue: parseFloat(ratingText) || 0,
          // Explicit radix so values such as "0x10" are not read as hexadecimal.
          ratingCount: Number.parseInt(ratingCount, 10) || 0,
          priceText,
          capacityText,
        };
      })
      .get();
  }

  /**
   * Entry point: launches the browser (optionally through a proxy), scrapes
   * the search URL and always closes the browser afterwards.
   *
   * @param {object} input - Actor input.
   * @param {string} input.searchUrl - Listing page to scrape (required).
   * @param {number} [input.maxItems] - Item cap; missing or null means unlimited.
   * @param {object} [input.proxyConfiguration] - Passed to `Actor.createProxyConfiguration`.
   * @returns {Promise<void>}
   * @throws {Error} When `searchUrl` is missing.
   */
  async run(input) {
    const { searchUrl, maxItems, proxyConfiguration } = input ?? {};
    if (!searchUrl) {
      // Fail before a browser is launched rather than inside page.goto().
      throw new Error('Input field "searchUrl" is required.');
    }

    // `??` also covers an explicit null, which would otherwise compare as 0.
    this.maxItems = maxItems ?? Infinity;
    this.searchUrl = searchUrl;

    const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
    const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
    const proxyOptions = this.buildProxyOptions(proxyUrl);

    const { page, browser } = await connectRealBrowser({
      args: ['--start-maximized'],
      turnstile: SCRAPER_CONFIG.turnstile,
      headless: SCRAPER_CONFIG.headless,
      customConfig: {},
      connectOption: {
        defaultViewport: null,
      },
      ...(proxyOptions ? { proxy: proxyOptions } : {}),
      plugins: [],
    });

    try {
      page.setDefaultNavigationTimeout(300 * 1000);
      page.setDefaultTimeout(3600 * 1000);
      await this.handlePage(page, this.searchUrl);
    } finally {
      await browser?.close();
    }
  }

  /**
   * Loads `url`, harvests the first page, then keeps triggering the paging
   * action until the item cap is reached or several consecutive rounds
   * produce nothing new.
   *
   * @param {object} page - Browser page.
   * @param {string} url - Listing URL to open.
   * @returns {Promise<void>}
   */
  async handlePage(page, url) {
    console.log(`Processing ${url}...`);
    const collected = [];
    let round = 0;
    let endOfResults = false;
    let lastTotal = 0;
    let staleRounds = 0;

    await page.goto(url);
    await BaseScraper.#waitForListings(page, 60000);
    await this.processResponseData(await page.content(), collected);

    while (!endOfResults && collected.length < this.maxItems) {
      round += 1;
      console.log(`data fetching...${round}`);
      await this.randomDelay();

      await SCRAPER_CONFIG.autoScrollPage(page);
      // Give the page time to render the next batch before reading the DOM.
      await this.randomDelay(3000, 5000);
      await BaseScraper.#waitForListings(page, 10000);

      await this.processResponseData(await page.content(), collected);

      if (collected.length === lastTotal) {
        staleRounds += 1;
        endOfResults = staleRounds >= SCRAPER_CONFIG.maxNoNewDataCount;
      } else {
        staleRounds = 0;
        lastTotal = collected.length;
      }
    }

    console.log(`Scraping completed. Total items: ${collected.length}`);
  }

  /**
   * Enables request interception; when `SCRAPER_CONFIG.filterResources` is
   * set, heavy resource types are aborted and everything else continues.
   *
   * @param {object} page - Browser page.
   * @param {object[]} interceptedData - Unused; kept for call compatibility.
   * @returns {Promise<void>}
   */
  async setupInterceptors(page, interceptedData) {
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      // Another handler may already have resolved this request.
      if (request.isInterceptResolutionHandled?.()) {
        return;
      }
      const shouldBlock =
        SCRAPER_CONFIG.filterResources &&
        BaseScraper.#BLOCKED_RESOURCE_TYPES.has(request.resourceType());
      const outcome = shouldBlock ? request.abort() : request.continue();
      // abort()/continue() are async; never leave their rejection unhandled.
      Promise.resolve(outcome).catch((error) => {
        console.warn(`Request interception failed: ${error?.message ?? error}`);
      });
    });
  }

  /**
   * Appends every not-yet-seen listing to `interceptedData`, stamping it with
   * the search URL and a scrape timestamp, without exceeding `this.maxItems`.
   *
   * @param {object[]} items - Freshly parsed listings.
   * @param {object[]} interceptedData - Accumulated records (mutated in place).
   * @param {string} [uniqueField] - Key used for de-duplication.
   * @returns {object[]} The records added by this call.
   */
  appendNewItems(items, interceptedData, uniqueField = SCRAPER_CONFIG.uniqueIdField) {
    const seenIds = new Set(interceptedData.map((item) => item[uniqueField]));
    const added = [];

    for (const listing of items) {
      // Check the cap BEFORE pushing so a full list never grows further.
      if (interceptedData.length >= this.maxItems) {
        break;
      }
      const id = listing[uniqueField];
      if (seenIds.has(id)) {
        continue;
      }

      const record = {
        searchUrl: this.searchUrl,
        ...listing,
        scrapedAt: new Date().toISOString(),
      };
      interceptedData.push(record);
      added.push(record);
      seenIds.add(id);
    }

    return added;
  }

  /**
   * Parses a page's HTML, merges new listings into `interceptedData` and
   * pushes the newly added ones to the Actor dataset.
   *
   * @param {string} htmlContent - Full page HTML.
   * @param {object[]} interceptedData - Accumulated records (mutated in place).
   * @returns {Promise<void>}
   */
  async processResponseData(htmlContent, interceptedData) {
    if (!htmlContent) {
      return;
    }

    const items = this.parsePageItems(cheerio.load(htmlContent));
    if (!items || items.length === 0) {
      return;
    }

    const currentData = this.appendNewItems(items, interceptedData);
    if (currentData.length > 0) {
      console.log(`Saved ${currentData.length} items, total: ${interceptedData.length}`);
      await Actor.pushData(currentData);
    }
  }

  /**
   * Sleeps for a random whole number of milliseconds in `[min, max]`.
   *
   * @param {number} [min=1000] - Lower bound in milliseconds.
   * @param {number} [max=3000] - Upper bound in milliseconds.
   * @returns {Promise<void>}
   */
  async randomDelay(min = 1000, max = 3000) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }

  /**
   * Placeholder hook; currently does nothing.
   *
   * @param {object} page - Browser page.
   * @returns {Promise<void>}
   */
  async setCookies(page) {}

  /**
   * Converts a proxy URL into the `{ host, port, username, password }` shape
   * passed as the browser's `proxy` option.
   *
   * @param {string|undefined} proxyUrl - Proxy URL, possibly with credentials.
   * @returns {{host: string, port: number, username: (string|undefined),
   *   password: (string|undefined)}|undefined} Options, or `undefined` when
   *   no usable proxy can be derived.
   */
  buildProxyOptions(proxyUrl) {
    if (!proxyUrl) {
      return undefined;
    }

    let parsed;
    try {
      parsed = new URL(proxyUrl);
    } catch (error) {
      // The URL may embed credentials, so it is deliberately not logged.
      console.warn(`Invalid proxy URL detected: ${error.message}`);
      return undefined;
    }

    // URL#port is '' when the port equals the scheme default (e.g. http + 80).
    const port = parsed.port
      ? Number(parsed.port)
      : BaseScraper.#DEFAULT_PROXY_PORTS.get(parsed.protocol);
    if (!parsed.hostname || !port) {
      console.warn('Proxy URL has no usable host or port; continuing without a proxy.');
      return undefined;
    }

    return {
      host: parsed.hostname,
      port,
      username: BaseScraper.#decodeCredential(parsed.username),
      password: BaseScraper.#decodeCredential(parsed.password),
    };
  }
}
343
344
345
346
347
// Actor.main() initialises the Actor before invoking the callback and exits
// it afterwards, so no separate Actor.init() call is needed. Awaiting it keeps
// the returned promise from floating.
await Actor.main(async () => {
  const input = await Actor.getInput();

  // No input at all: fall back to the configured URL and item cap.
  // Input without a searchUrl: keep the caller's other fields and only fill in the URL.
  const effectiveInput = input
    ? { ...input, searchUrl: input.searchUrl || SCRAPER_CONFIG.requestUrl }
    : { searchUrl: SCRAPER_CONFIG.requestUrl, maxItems: SCRAPER_CONFIG.maxItems };

  const crawler = new BaseScraper();
  await crawler.run(effectiveInput);
});