1import { Actor } from 'apify';
2import { connect as connectRealBrowser } from 'puppeteer-real-browser';
3import * as cheerio from 'cheerio';
4
5
6
7
/**
 * Static settings shared by the scraper.
 *
 * Besides plain values, it carries `autoScrollPage`, the callback the scraper
 * invokes to advance to the next batch of results.
 */
const SCRAPER_CONFIG = {
  // Number of consecutive rounds without new items after which scraping stops.
  maxNoNewDataCount: 3,

  // Browser launch flags forwarded to the browser connector.
  headless: false,
  turnstile: true,

  // When true, the request interceptor aborts heavy resource types.
  filterResources: false,

  // Defaults used when the actor is started without input.
  maxItems: 50,
  requestUrl: 'https://www.noon.com/uae-en/search/?q=iphone+17+pro',

  // Name of the item field used to detect duplicates.
  uniqueIdField: 'productId',

  // Selector of a single result box; also used to wait for results to render.
  waitSelector: '[data-qa="plp-product-box"]',

  /**
   * Advances the listing by clicking the "Next page" link inside the page.
   * Does nothing when the link is not present.
   *
   * @param {object} page - Page object exposing `evaluate`.
   * @returns {Promise<void>}
   */
  async autoScrollPage(page) {
    await page.evaluate(() => {
      document.querySelector('a[aria-label="Next page"]')?.click();
    });
  },
};
35
36
37
38
/**
 * Scrapes product listings from a paginated search-results page.
 *
 * Flow: connect a browser through `connectRealBrowser`, open the search URL,
 * parse each rendered page with cheerio, de-duplicate the parsed items, push
 * the new ones with `Actor.pushData`, then advance with
 * `SCRAPER_CONFIG.autoScrollPage` until the item cap is reached or several
 * consecutive rounds produce nothing new.
 */
class BaseScraper {
  /**
   * Extracts one record per result box (`SCRAPER_CONFIG.waitSelector`).
   *
   * @param {Function} $ - Cheerio root loaded with the page HTML.
   * @returns {Array<object>} Records with `productId`, `productName`, `price`,
   *   `imageUrl`, `productUrl`, `discount`, `isExpress` (boolean),
   *   `deliveryDate` and `deliveryTime`. Missing text fields are empty strings.
   */
  parsePageItems($) {
    const siteOrigin = 'https://www.noon.com';

    return $(SCRAPER_CONFIG.waitSelector)
      .map((i, el) => {
        const $item = $(el);

        const href = $item.find('a[href*="/uae-en/"]').first().attr('href') || '';
        let productUrl = '';
        if (href) {
          try {
            // Resolving against the origin handles both relative and absolute
            // hrefs; plain concatenation would mangle an absolute one.
            productUrl = new URL(href, siteOrigin).href;
          } catch {
            productUrl = `${siteOrigin}${href}`;
          }
        }
        const productIdMatch = href.match(/N\d+V\/p\//);
        const productId = productIdMatch ? productIdMatch[0].replace('/p/', '') : '';

        const imageEl = $item.find('img').first();
        const imageUrl = imageEl.attr('src') || imageEl.attr('data-src') || '';

        let productName = $item.find('[data-qa="plp-product-box-name"]').text().trim();
        if (!productName) {
          // Fall back to the name embedded in the image's data-qa attribute.
          const dataQa = imageEl.attr('data-qa') || '';
          productName = dataQa.replace('productImagePLP_', '');
        }

        const price = $item.find('div[class*="__sellingPrice"] > strong').text().trim();
        const discount = $item.find('div[class*="__discount"]').text().trim();
        const isExpress = $item.find('[data-qa="product-noon-express"]').length > 0;

        const deliveryDate = $item
          .find('.ProductDetailsSection-module-scss-module__estimatedDeliveryDateNewTextStyles')
          .text()
          .trim()
          .replace(/\s+/g, ' ');
        const deliveryTime = $item
          .find('.FlyoutBadgeLarge-module-scss-module__estimationText')
          .text()
          .trim()
          .replace(/\s+/g, ' ');

        return {
          productId,
          productName,
          price,
          imageUrl,
          productUrl,
          discount,
          isExpress,
          deliveryDate,
          deliveryTime,
        };
      })
      .get();
  }

  /**
   * Connects a browser, scrapes the search URL and always closes the browser.
   *
   * @param {object} input - Actor input.
   * @param {string} [input.searchUrl] - Page to scrape; defaults to
   *   `SCRAPER_CONFIG.requestUrl` when missing or empty.
   * @param {number} [input.maxItems] - Item cap; unlimited when missing,
   *   `null` or not numeric.
   * @param {object} [input.proxyConfiguration] - Passed to
   *   `Actor.createProxyConfiguration`.
   * @returns {Promise<void>}
   */
  async run(input) {
    const { searchUrl, maxItems, proxyConfiguration } = input ?? {};

    // `null` would otherwise coerce to 0 in the numeric comparisons below.
    const requestedMax = maxItems == null ? Infinity : Number(maxItems);
    this.maxItems = Number.isNaN(requestedMax) ? Infinity : requestedMax;
    // Without a URL `page.goto` would be called with `undefined`.
    this.searchUrl = searchUrl || SCRAPER_CONFIG.requestUrl;

    const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
    const proxyUrl = proxyConfig ? await proxyConfig.newUrl() : undefined;
    const proxyOptions = this.buildProxyOptions(proxyUrl);
    if (proxyUrl && !proxyOptions) {
      console.warn('Proxy URL could not be converted to browser options; continuing without a proxy.');
    }

    const realBrowserOption = {
      args: ['--start-maximized'],
      turnstile: SCRAPER_CONFIG.turnstile,
      headless: SCRAPER_CONFIG.headless,
      customConfig: {},
      connectOption: {
        defaultViewport: null,
      },
      ...(proxyOptions ? { proxy: proxyOptions } : {}),
      plugins: [],
    };

    const { page, browser } = await connectRealBrowser(realBrowserOption);

    try {
      page.setDefaultNavigationTimeout(300 * 1000);
      page.setDefaultTimeout(3600 * 1000);

      await this.handlePage(page, this.searchUrl);
    } finally {
      await browser?.close();
    }
  }

  /**
   * Loads `url`, processes the first page, then keeps paginating until the
   * item cap is hit or `SCRAPER_CONFIG.maxNoNewDataCount` consecutive rounds
   * add no new items.
   *
   * @param {object} page - Browser page.
   * @param {string} url - Address to open.
   * @returns {Promise<void>}
   */
  async handlePage(page, url) {
    console.log(`Processing ${url}...`);
    const interceptedData = [];

    await page.goto(url);
    await this.waitForItems(page, 60000);
    await this.processResponseData(await page.content(), interceptedData);

    // Baseline is taken AFTER the first page so that a first round which adds
    // nothing already counts as a stall.
    let lastDataLength = interceptedData.length;
    let noNewDataCount = 0;
    let endOfResults = false;
    let count = 0;

    while (!endOfResults && interceptedData.length < this.maxItems) {
      count += 1;
      console.log(`data fetching...${count}`);
      await this.randomDelay();

      await SCRAPER_CONFIG.autoScrollPage(page);
      await this.randomDelay(3000, 5000);
      await this.waitForItems(page, 10000);

      await this.processResponseData(await page.content(), interceptedData);

      if (interceptedData.length === lastDataLength) {
        noNewDataCount += 1;
        endOfResults = noNewDataCount >= SCRAPER_CONFIG.maxNoNewDataCount;
      } else {
        noNewDataCount = 0;
        lastDataLength = interceptedData.length;
      }
    }

    console.log(`Scraping completed. Total items: ${interceptedData.length}`);
  }

  /**
   * Best-effort wait for result boxes; a timeout is logged, not thrown.
   * Skipped entirely when no `waitSelector` is configured.
   *
   * @param {object} page - Browser page.
   * @param {number} timeout - Maximum wait in milliseconds.
   * @returns {Promise<void>}
   */
  async waitForItems(page, timeout) {
    if (!SCRAPER_CONFIG.waitSelector) {
      return;
    }
    await page.waitForSelector(SCRAPER_CONFIG.waitSelector, { timeout }).catch(() => {
      console.log(`data not found...`);
    });
  }

  /**
   * Enables request interception and, when `SCRAPER_CONFIG.filterResources`
   * is on, aborts heavy resource types. Not called by `run` at present.
   *
   * @param {object} page - Browser page.
   * @param {Array<object>} interceptedData - Unused; kept for the signature.
   * @returns {Promise<void>}
   */
  async setupInterceptors(page, interceptedData) {
    const blockedResourceTypes = ['image', 'media', 'font', 'texttrack', 'stylesheet', 'ping'];
    const reportFailure = (error) => {
      console.warn('Request interception failed:', error);
    };

    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const shouldBlock =
        SCRAPER_CONFIG.filterResources && blockedResourceTypes.includes(request.resourceType());
      // abort()/continue() return promises; handle rejections so they do not
      // surface as unhandled rejections.
      const handled = shouldBlock ? request.abort() : request.continue();
      Promise.resolve(handled).catch(reportFailure);
    });
    page.on('response', async (response) => {
      // Placeholder: responses are not inspected.
    });
  }

  /**
   * Parses `htmlContent`, appends unseen items to `interceptedData` (mutated
   * in place) and pushes the newly added ones with `Actor.pushData`.
   *
   * Items are keyed by `SCRAPER_CONFIG.uniqueIdField`, falling back to
   * `productUrl` when that field is empty so ID-less items are not all
   * collapsed into one. Never grows `interceptedData` beyond `this.maxItems`.
   *
   * @param {string} htmlContent - Page HTML; nothing happens when empty.
   * @param {Array<object>} interceptedData - Accumulated results.
   * @returns {Promise<void>}
   */
  async processResponseData(htmlContent, interceptedData) {
    if (!htmlContent) {
      return;
    }

    const $ = cheerio.load(htmlContent);
    const items = this.parsePageItems($);
    if (!items || items.length === 0) {
      return;
    }

    const uniqueField = SCRAPER_CONFIG.uniqueIdField;
    const dedupeKey = (item) => item[uniqueField] || item.productUrl || '';
    const seenKeys = new Set(interceptedData.map(dedupeKey));
    const currentData = [];

    for (const listing of items) {
      // Checked before pushing so the cap holds even when it is 0 or was
      // already reached on entry.
      if (interceptedData.length >= this.maxItems) {
        break;
      }

      const key = dedupeKey(listing);
      if (seenKeys.has(key)) {
        continue;
      }

      const data = {
        searchUrl: this.searchUrl,
        ...listing,
        scrapedAt: new Date().toISOString(),
      };

      interceptedData.push(data);
      currentData.push(data);
      seenKeys.add(key);
    }

    if (currentData.length > 0) {
      console.log(`Saved ${currentData.length} items, total: ${interceptedData.length}`);
      await Actor.pushData(currentData);
    }
  }

  /**
   * Sleeps for a random whole number of milliseconds in `[min, max]`.
   *
   * @param {number} [min=1000] - Lower bound in milliseconds.
   * @param {number} [max=3000] - Upper bound in milliseconds.
   * @returns {Promise<void>}
   */
  async randomDelay(min = 1000, max = 3000) {
    const delay = Math.floor(Math.random() * (max - min + 1) + min);
    await new Promise((resolve) => setTimeout(resolve, delay));
  }

  /**
   * Placeholder hook; currently does nothing.
   *
   * @param {object} page - Browser page.
   * @returns {Promise<void>}
   */
  async setCookies(page) {}

  /**
   * Converts a proxy URL into `{ host, port, username, password }`.
   *
   * @param {string} [proxyUrl] - e.g. `http://user:pass@host:8000`.
   * @returns {{host: string, port: number, username: (string|undefined),
   *   password: (string|undefined)}|undefined} `undefined` when the URL is
   *   missing, unparsable, or has no host or resolvable port.
   */
  buildProxyOptions(proxyUrl) {
    if (!proxyUrl) {
      return undefined;
    }

    // URL exposes credentials percent-encoded; return them decoded, keeping
    // the raw text if it is not valid percent-encoding.
    const decodeCredential = (value) => {
      if (!value) {
        return undefined;
      }
      try {
        return decodeURIComponent(value);
      } catch {
        return value;
      }
    };

    try {
      const parsed = new URL(proxyUrl);
      // The URL parser reports '' for a scheme's default port, so
      // `http://host:80` must be mapped back to 80 rather than rejected.
      const defaultPorts = { 'http:': 80, 'https:': 443 };
      const port = parsed.port ? Number(parsed.port) : defaultPorts[parsed.protocol];
      if (!parsed.hostname || !port) {
        return undefined;
      }

      return {
        host: parsed.hostname,
        port,
        username: decodeCredential(parsed.username),
        password: decodeCredential(parsed.password),
      };
    } catch (error) {
      console.warn(`Invalid proxy URL detected: ${proxyUrl}`, error);
      return undefined;
    }
  }
}
324
325
326
327
328
// Actor entry point: initialise the SDK, resolve the input (falling back to
// the defaults in SCRAPER_CONFIG when no input is supplied) and run the scraper.
await Actor.init();

Actor.main(async () => {
  const fallbackInput = {
    searchUrl: SCRAPER_CONFIG.requestUrl,
    maxItems: SCRAPER_CONFIG.maxItems,
  };
  const input = (await Actor.getInput()) || fallbackInput;

  await new BaseScraper().run(input);
});