import { Actor } from 'apify';
import { connect as connectRealBrowser } from 'puppeteer-real-browser';
import * as cheerio from 'cheerio';

const SCRAPER_CONFIG = {
  // Stop paginating after this many consecutive passes that yield no new items.
  maxNoNewDataCount: 3,
  // puppeteer-real-browser options: run headful and let it handle Cloudflare Turnstile.
  headless: false,
  turnstile: true,
  // When true, request interception (see setupInterceptors) blocks heavy resource types.
  filterResources: false,
  maxItems: 50,
  requestUrl: 'https://www.johnlewis.com/search?page=1&search-term=coat',
  uniqueIdField: 'uniqueId',
  waitSelector: 'article[data-testid="product-card-layout"]',
  // The search results are paginated, so "scrolling" here means clicking the
  // "next page" link rather than scrolling the viewport.
  autoScrollPage: async (page) => {
    await page.evaluate(async () => {
      const nextButton = document.querySelector('a[data-testid="next-btn"]');
      if (nextButton) {
        nextButton.click();
      }
    });
  }
};

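/**
 * Single-class scraper: opens the John Lewis search page with
 * puppeteer-real-browser, pages through results by clicking "next",
 * parses product cards with cheerio, and pushes deduplicated items
 * to the Apify dataset.
 */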
class BaseScraper {
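  /**
   * Extract one record per product card from a cheerio-loaded copy of the page.
   * Selectors target John Lewis's data-testid attributes and hashed CSS-module
   * class prefixes, so they may need updating if the site markup changes.
   */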
  parsePageItems($) {
    const items = $(SCRAPER_CONFIG.waitSelector).map((i, el) => {
      const $item = $(el);
      const productId = $item.attr('data-product-id') || '';

      // Prefer the plain src attribute; otherwise fall back to the first URL in srcset.
      const imageEl = $item.find('img[data-testid="product-image"]').first();
      const imageUrl = imageEl.attr('src') || imageEl.attr('srcset')?.split(',')[0]?.trim()?.split(' ')[0] || '';

      const titleEl = $item.find('h2[data-testid="product-title"]');
      const titleText = titleEl.text().trim();
      const productUrl = titleEl.find('a').attr('href') || '';
      const brandEl = titleEl.find('span[class*="Brand_title__brand__"]');
      const brand = brandEl.text().trim();

      const prevPrice = $item.find('span[data-testid="product-card-price-prev"]').first().text().trim();
      const nowPrice = $item.find('span[data-testid="product-card-price-now"]').last().text().trim();

      // Each colour swatch button exposes the colour name in its title attribute.
      const colourButtons = $item.find('button[class*="ColourSwatches_button__"]');
      const colours = colourButtons.map((i, btn) => {
        return $(btn).attr('title');
      }).get();

      return {
        uniqueId: productId,
        productTitle: titleText,
        brand: brand,
        prevPrice: prevPrice,
        nowPrice: nowPrice,
        imageUrl: imageUrl,
        productUrl: productUrl ? `https://www.johnlewis.com${productUrl}` : '',
        colours: colours
      };
    }).get();

    return items;
  }

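  /**
   * Entry point for a single scraping job: resolves the Apify proxy (if any),
   * launches a real browser, and delegates pagination to handlePage().
   */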
  async run(input) {
    const { searchUrl, maxItems = Infinity, proxyConfiguration } = input;
    this.maxItems = maxItems;
    this.searchUrl = searchUrl;

    const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
    const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
    const proxyOptions = this.buildProxyOptions(proxyUrl);

    const realBrowserOption = {
      args: ['--start-maximized'],
      turnstile: SCRAPER_CONFIG.turnstile,
      headless: SCRAPER_CONFIG.headless,
      customConfig: {},
      connectOption: {
        defaultViewport: null
      },
      // Only pass a proxy block when a usable proxy URL was resolved.
      ...(proxyOptions ? { proxy: proxyOptions } : {}),
      plugins: []
    };

    const { page, browser } = await connectRealBrowser(realBrowserOption);

    try {
      // Generous timeouts: the site can be slow behind the anti-bot challenge.
      page.setDefaultNavigationTimeout(300 * 1000);
      page.setDefaultTimeout(3600 * 1000);

      await this.handlePage(page, this.searchUrl);
    } finally {
      await browser?.close();
    }
  }

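  /**
   * Navigate to the search URL, then repeatedly click through result pages,
   * re-parsing the DOM after each step until maxItems is reached or
   * maxNoNewDataCount consecutive passes produce nothing new.
   */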
  async handlePage(page, url) {
    console.log(`Processing ${url}...`);
    let count = 0;
    let endOfResults = false;
    let interceptedData = [];
    let lastDataLength = 0;
    let noNewDataCount = 0;

    await page.goto(url);

    // Wait for the first batch of product cards to render.
    if (SCRAPER_CONFIG.waitSelector) {
      await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 60000 }).catch(() => {
        console.log('data not found...');
      });
    }

    // Parse the first results page.
    const firstPageHtml = await page.content();
    await this.processResponseData(firstPageHtml, interceptedData);

    while (!endOfResults && interceptedData.length < this.maxItems) {
      count++;
      console.log(`data fetching...${count}`);
      await this.randomDelay();

      // Advance to the next results page and give it time to load.
      await SCRAPER_CONFIG.autoScrollPage(page);
      await this.randomDelay(3000, 5000);

      if (SCRAPER_CONFIG.waitSelector) {
        await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout: 10000 }).catch(() => {
          console.log('data not found...');
        });
      }

      const afterScrollHtml = await page.content();
      await this.processResponseData(afterScrollHtml, interceptedData);

      // Treat several consecutive passes with no new items as the end of results.
      if (interceptedData.length === lastDataLength) {
        noNewDataCount++;
        if (noNewDataCount >= SCRAPER_CONFIG.maxNoNewDataCount) {
          endOfResults = true;
        }
      } else {
        noNewDataCount = 0;
        lastDataLength = interceptedData.length;
      }
    }

    console.log(`Scraping completed. Total items: ${interceptedData.length}`);
  }

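  /**
   * Optional request/response interception. It is not wired into run() in this
   * version; call it before page.goto() to block heavy resources when
   * SCRAPER_CONFIG.filterResources is enabled.
   */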
  async setupInterceptors(page, interceptedData) {
    await page.setRequestInterception(true);

    page.on('request', (request) => {
      const resourceType = request.resourceType();
      const blockedResourceTypes = [
        'image',
        'media',
        'font',
        'texttrack',
        'stylesheet',
        'ping'
      ];

      if (SCRAPER_CONFIG.filterResources && blockedResourceTypes.includes(resourceType)) {
        request.abort();
      } else {
        request.continue();
      }
    });

    page.on('response', async (response) => {
      // No response handling: data is parsed from page.content() rather than
      // from intercepted API responses.
    });
  }

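  /**
   * Parse an HTML snapshot, deduplicate against what has already been collected
   * (by SCRAPER_CONFIG.uniqueIdField), and push only the new items to the dataset.
   */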
  async processResponseData(htmlContent, interceptedData) {
    let currentData = [];

    if (!htmlContent) {
      return;
    }

    const $ = cheerio.load(htmlContent);
    const items = this.parsePageItems($);

    if (!items || items.length === 0) {
      return;
    }

    // Skip anything already collected on a previous pass.
    const uniqueField = SCRAPER_CONFIG.uniqueIdField;
    const existingIds = new Set(interceptedData.map(item => item[uniqueField]));

    let skippedCount = 0;
    for (const listing of items) {
      if (existingIds.has(listing[uniqueField])) {
        skippedCount++;
        continue;
      }

      const data = {
        searchUrl: this.searchUrl,
        ...listing,
        scrapedAt: new Date().toISOString()
      };

      interceptedData.push(data);
      currentData.push(data);
      existingIds.add(listing[uniqueField]);

      if (interceptedData.length >= this.maxItems) {
        break;
      }
    }

    if (currentData.length > 0) {
      console.log(`Saved ${currentData.length} items, total: ${interceptedData.length}`);
      await Actor.pushData(currentData);
    }
  }

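  // Random pause between actions to look less like an automated client.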
  async randomDelay(min = 1000, max = 3000) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise(resolve => setTimeout(resolve, delay));
  }

  // Placeholder hook: no cookies are set in this version. Kept as an extension
  // point for pre-seeding session cookies before navigation.
  async setCookies(page) {
  }

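  /**
   * Convert an Apify proxy URL into the { host, port, username, password }
   * object passed to puppeteer-real-browser, or undefined when no proxy is used.
   */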
  buildProxyOptions(proxyUrl) {
    if (!proxyUrl) {
      return undefined;
    }

    try {
      const parsed = new URL(proxyUrl);
      if (!parsed.hostname || !parsed.port) {
        return undefined;
      }

      return {
        host: parsed.hostname,
        port: Number(parsed.port),
        username: parsed.username || undefined,
        password: parsed.password || undefined,
      };
    } catch (error) {
      console.warn(`Invalid proxy URL detected: ${proxyUrl}`, error);
      return undefined;
    }
  }
}

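// Actor entry point. When no input is provided (e.g. a local run), fall back
// to the defaults from SCRAPER_CONFIG.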
Actor.main(async () => {
  // Actor.main() handles Actor.init() and Actor.exit() itself, so no separate
  // Actor.init() call is needed.
  let input = await Actor.getInput();
  if (!input) {
    input = {
      searchUrl: SCRAPER_CONFIG.requestUrl,
      maxItems: SCRAPER_CONFIG.maxItems
    };
  }

  const crawler = new BaseScraper();
  await crawler.run(input);
});