1
2import { Actor } from 'apify';
3import { PuppeteerCrawler, log } from 'crawlee';
4import puppeteer from 'puppeteer-extra';
5import StealthPlugin from 'puppeteer-extra-plugin-stealth';
6import {
7 randomUserAgent,
8 randomDelay,
9 formatPrice,
10 extractCurrency,
11 normalizeText,
12 extractListingId,
13} from './utils.js';
14import { createBaseRowIntegration } from './baserow.js';
15
16
// Register the stealth plugin on the puppeteer-extra instance that is later
// handed to the crawler as its launcher.
puppeteer.use(StealthPlugin());

await Actor.init();

// A missing Actor input is treated as an empty object, so every option below
// falls back to its default.
const input = (await Actor.getInput()) || {};

/**
 * Reads one input option. The fallback is used only when the key is absent or
 * `undefined`; an explicit `null` is passed through unchanged.
 */
const inputOr = (key, fallback) => (input[key] === undefined ? fallback : input[key]);

const startUrls = inputOr('startUrls', [
    { url: 'https://www.sahibinden.com/satilik-daire/istanbul?sorting=date_desc' },
]);
// `null` means "no limit on scraped items".
const maxItems = inputOr('maxItems', null);
const maxConcurrency = inputOr('maxConcurrency', 3);
const proxyConfiguration = inputOr('proxyConfiguration', {
    useApifyProxy: true,
    apifyProxyGroups: ['RESIDENTIAL'],
    countryCode: 'TR',
});
// NOTE(review): the three BaseRow options are read here but not referenced
// anywhere else in this file — confirm how createBaseRowIntegration() gets them.
const baseRowApiToken = input.baseRowApiToken;
const baseRowTableId = input.baseRowTableId;
const baseRowDatabaseId = input.baseRowDatabaseId;
// Cookies exported from a real browser session; injected before navigation.
const sessionCookies = inputOr('sessionCookies', []);
// When true, screenshots/HTML snapshots are written to the key-value store.
const debugMode = inputOr('debugMode', false);
39
40
// Effective proxy settings.
//
// Apify Proxy defaults (RESIDENTIAL group, Turkish exit) are only filled in
// when the user did NOT supply their own proxy URLs: the Apify SDK refuses a
// configuration that mixes custom `proxyUrls` with Apify groups/country, so
// forcing the defaults there would make createProxyConfiguration() throw.
const usesCustomProxyUrls = Array.isArray(proxyConfiguration?.proxyUrls)
    && proxyConfiguration.proxyUrls.length > 0;

const finalProxyConfiguration = usesCustomProxyUrls
    ? { ...proxyConfiguration }
    : { useApifyProxy: true, ...(proxyConfiguration || {}) };

if (!usesCustomProxyUrls) {
    const hasEntries = (value) => Array.isArray(value) && value.length > 0;

    // The SDK accepts both `groups` and `apifyProxyGroups`; respect either one.
    if (!hasEntries(finalProxyConfiguration.groups) && !hasEntries(finalProxyConfiguration.apifyProxyGroups)) {
        finalProxyConfiguration.apifyProxyGroups = ['RESIDENTIAL'];
    }
    // Likewise `countryCode` / `apifyProxyCountry`; only default when neither is set,
    // so a country chosen via `apifyProxyCountry` is not silently overridden.
    if (!finalProxyConfiguration.countryCode && !finalProxyConfiguration.apifyProxyCountry) {
        finalProxyConfiguration.countryCode = 'TR';
    }
}

// May be undefined (e.g. `useApifyProxy: false` without custom URLs); the
// startup logging below handles that case.
const proxyConfig = await Actor.createProxyConfiguration(finalProxyConfiguration);
55
// Summarise the run configuration at startup.
//
// Cookie names accept the `key` alias as well as `name`, matching what the
// cookie-injection code accepts; otherwise cookies exported with `key` would be
// reported as missing and trigger a false PerimeterX warning.
const providedCookies = Array.isArray(sessionCookies) ? sessionCookies : [];
const cookieNames = providedCookies
    .map((c) => c?.name ?? c?.key)
    .filter((n) => typeof n === 'string' && n.length > 0);
const hasCfClearanceInput = cookieNames.includes('cf_clearance');
const hasPxCookies = ['_px3', '_pxhd', '_pxvid', 'pxcts'].some((n) => cookieNames.includes(n));

// `startUrls` may be a single item rather than an array (the start-request
// builder further down tolerates that), so normalise before mapping.
const startUrlList = (Array.isArray(startUrls) ? startUrls : [startUrls])
    .map((u) => (typeof u === 'string' ? u : u?.url));

log.info('Starting Sahibinden Emlak Scraper', {
    startUrls: startUrlList,
    maxItems,
    maxConcurrency,
    proxyGroups: finalProxyConfiguration.apifyProxyGroups,
    countryCode: finalProxyConfiguration.countryCode,
    sessionCookiesProvided: providedCookies.length,
    cookieNames,
    hasCfClearance: hasCfClearanceInput,
    hasPerimeterXCookies: hasPxCookies,
});
if (!hasPxCookies) {
    log.warning('No PerimeterX cookies provided (_px3, _pxhd, _pxvid, pxcts). ' +
        'If sahibinden shows a "Basılı Tutun" challenge, the actor will try to hold it automatically. ' +
        'For reliable bypass: visit sahibinden.com in your browser, solve the hold challenge, then export ALL cookies.');
}

if (proxyConfig) {
    log.info('Using proxy configuration', {
        type: proxyConfig.usesApifyProxy ? 'Apify Proxy' : 'Custom Proxies',
        groups: finalProxyConfiguration.apifyProxyGroups,
        country: finalProxyConfiguration.countryCode,
    });
} else {
    log.warning('No proxy configuration specified. Sahibinden.com requires RESIDENTIAL proxy!');
}
85
86
// Optional BaseRow sink. A failure while creating it is logged and the run
// continues without it (the variable then stays null).
let baseRowIntegration = await (async () => {
    try {
        return await createBaseRowIntegration();
    } catch (error) {
        log.warning('BaseRow integration initialization failed, continuing without it.', { error: error.message });
        return null;
    }
})();

// Number of listings collected so far across all pages; compared against maxItems.
let scrapedItemsCount = 0;
95
96
/**
 * Tells whether a page's HTML contains one of the known interstitial markers
 * (English and Turkish) that callers treat as a Cloudflare challenge page.
 *
 * @param {string} html - Full page HTML.
 * @returns {boolean} True when at least one marker occurs in `html`.
 */
function isChallengedPage(html) {
    const challengeMarkers = [
        'Just a moment',
        'Checking your browser',
        'cf-browser-verification',
        'challenge-platform',
        'Güvenlik doğrulaması gerçekleştirme',
        'Bir dakika lütfen',
        'Uyumsuz tarayıcı eklentisi',
    ];
    return challengeMarkers.some((marker) => html.includes(marker));
}
108
109
/**
 * Tells whether a page's HTML contains one of the markers that callers treat
 * as a PerimeterX "press and hold" ("Basılı Tutun") challenge.
 *
 * @param {string} html - Full page HTML.
 * @returns {boolean} True when at least one marker occurs in `html`.
 */
function isPxHoldChallenge(html) {
    const holdMarkers = [
        'Basılı Tutun',
        'px-captcha',
        '_pxCaptcha',
        'PerimeterX',
        'Bağlantınız kontrol ediliyor',
        'human-challenge',
    ];
    return holdMarkers.some((marker) => html.includes(marker));
}
120
121
/**
 * Attempts to pass a "press and hold" challenge by locating the hold element,
 * moving the mouse onto its centre, keeping the button pressed and then
 * checking whether the page still looks like a challenge.
 *
 * Never throws: any error is logged and reported as `false`.
 *
 * @param {object} page - Puppeteer page currently showing the challenge.
 * @param {object} [options] - Timing overrides; the defaults reproduce the
 *   previously hard-coded values.
 * @param {number} [options.holdMs=10000] - How long the button is kept pressed.
 * @param {number} [options.approachPauseMs=300] - Pause after the first mouse move.
 * @param {number} [options.settlePauseMs=200] - Pause after reaching the target.
 * @param {number} [options.navigationTimeoutMs=15000] - Max wait for the post-release navigation.
 * @returns {Promise<boolean>} True when, after the hold, the page no longer
 *   matches isPxHoldChallenge() or isChallengedPage().
 */
async function tryHoldPxButton(page, options = {}) {
    const {
        holdMs = 10000,
        approachPauseMs = 300,
        settlePauseMs = 200,
        navigationTimeoutMs = 15000,
    } = options;
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    // Tried in order; the bare `button` is a last-resort fallback.
    const selectors = [
        '#px-captcha',
        '.px-captcha-container',
        'div[id^="px-captcha"]',
        'button',
    ];

    try {
        let holdTarget = null;
        for (const sel of selectors) {
            holdTarget = await page.$(sel).catch(() => null);
            if (holdTarget) {
                log.info(`Found PX hold target with selector: ${sel}`);
                break;
            }
        }

        if (!holdTarget) {
            log.warning('Could not find PX hold button element.');
            return false;
        }

        const box = await holdTarget.boundingBox();
        if (!box) {
            log.warning('PX hold button has no bounding box (not visible?)');
            return false;
        }

        const cx = box.x + box.width / 2;
        const cy = box.y + box.height / 2;

        log.info(`Attempting PX hold at (${Math.round(cx)}, ${Math.round(cy)}) for ${holdMs / 1000}s...`);

        // Approach the target in two moves instead of jumping straight onto it.
        await page.mouse.move(cx - 50, cy - 30);
        await sleep(approachPauseMs);
        await page.mouse.move(cx, cy, { steps: 10 });
        await sleep(settlePauseMs);

        await page.mouse.down();
        await sleep(holdMs);

        // Start listening for the navigation BEFORE releasing the button: the
        // release is what triggers it, and a listener registered afterwards can
        // miss a fast redirect and then sit out the whole timeout.
        // A timeout here is not fatal — the content check below decides.
        const navigation = page
            .waitForNavigation({ waitUntil: 'networkidle2', timeout: navigationTimeoutMs })
            .catch(() => {});
        await page.mouse.up();

        log.info('Released PX hold button, waiting for redirect...');
        await navigation;

        const afterHtml = await page.content();
        if (isPxHoldChallenge(afterHtml) || isChallengedPage(afterHtml)) {
            log.warning('PX hold did not resolve the challenge.');
            return false;
        }

        log.info('PX hold challenge resolved successfully!');
        return true;
    } catch (e) {
        log.warning(`PX hold attempt error: ${e.message}`);
        return false;
    }
}
184
// Running index used to give every debug snapshot a unique, ordered key.
let debugCounter = 0;

/**
 * When `debugMode` is on, stores a full-page screenshot and the page HTML in
 * the key-value store and logs a truncated dump of the page cookies.
 *
 * The three steps are independent and best-effort: a failure in one is logged
 * as a warning and does not prevent the others or fail the caller.
 *
 * @param {object} page - Puppeteer page to snapshot.
 * @param {string} label - Short tag appended to the generated key
 *   (`DEBUG-<NNN>-<label>-screenshot` / `-html`).
 * @returns {Promise<void>}
 */
async function saveDebugInfo(page, label) {
    if (!debugMode) return;
    const idx = ++debugCounter;
    const key = `DEBUG-${String(idx).padStart(3, '0')}-${label}`;
    try {
        const screenshot = await page.screenshot({ fullPage: true, type: 'png' });
        await Actor.setValue(`${key}-screenshot`, screenshot, { contentType: 'image/png' });
        log.info(`[DEBUG] Screenshot saved → KV store key: "${key}-screenshot"`);
    } catch (e) {
        log.warning(`[DEBUG] Could not save screenshot: ${e.message}`);
    }
    try {
        const html = await page.content();
        await Actor.setValue(`${key}-html`, html, { contentType: 'text/html' });
        log.info(`[DEBUG] HTML saved → KV store key: "${key}-html" (${html.length} chars)`);
    } catch (e) {
        log.warning(`[DEBUG] Could not save HTML: ${e.message}`);
    }
    try {
        const cookies = await page.cookies();
        // Only the first 20 characters of each value are logged.
        const cookieSummary = cookies.map((c) => `${c.name}=${c.value.substring(0, 20)}... (expires: ${c.expires})`);
        log.info(`[DEBUG] Cookies at "${label}":`, { cookies: cookieSummary });
    } catch (e) {
        // Previously swallowed silently; report it like the other two steps.
        log.warning(`[DEBUG] Could not read cookies: ${e.message}`);
    }
}
210
211
/**
 * Maps a cookie `sameSite` value to one of the spellings Puppeteer accepts
 * ('Strict' | 'Lax' | 'None'). Browser cookie exports commonly use lowercase
 * names, `no_restriction` or `unspecified`; passing those through verbatim
 * makes the whole setCookie() call fail.
 *
 * @param {*} value - Raw `sameSite` value from the input cookie.
 * @returns {'Strict'|'Lax'|'None'} Normalised value; anything unrecognised
 *   (including a missing value) becomes 'Lax'.
 */
function normalizeSameSite(value) {
    switch (String(value ?? '').toLowerCase()) {
        case 'strict':
            return 'Strict';
        case 'none':
        case 'no_restriction':
            return 'None';
        default:
            return 'Lax';
    }
}

/**
 * Turns the user-provided cookies into objects suitable for page.setCookie(),
 * dropping entries without a name and entries whose expiry is in the past.
 *
 * @param {Array<object>} rawCookies - Cookies as provided in the actor input.
 * @param {number} nowSecs - Current time in seconds since the epoch.
 * @returns {Array<object>} Cookies ready for injection.
 */
function buildInjectableCookies(rawCookies, nowSecs) {
    return rawCookies
        // Some exporters call the name field `key`.
        .map((c) => ({ ...c, name: c?.name ?? c?.key ?? null }))
        .filter((c) => {
            if (!c.name) {
                log.debug('Skipping cookie with null/missing name field.');
                return false;
            }
            const expiry = c.expirationDate ?? c.expires ?? null;
            if (expiry && expiry < nowSecs) {
                log.debug(`Skipping expired cookie: ${c.name}`);
                return false;
            }
            return true;
        })
        .map((c) => ({
            name: c.name,
            value: c.value,
            domain: c.domain || '.sahibinden.com',
            path: c.path || '/',
            secure: c.secure !== false,
            httpOnly: c.httpOnly === true,
            sameSite: normalizeSameSite(c.sameSite),
        }));
}

/**
 * Injects the input `sessionCookies` into the page once per session. Later
 * requests of the same session skip injection so cookies refreshed during the
 * crawl are not overwritten by the (older) input values. Best-effort: failures
 * are logged, never thrown.
 */
async function injectSessionCookies(page, session, nowSecs) {
    if (session?.userData?.cookiesInjected === true) {
        log.debug('Session already has cookies from previous request — skipping re-injection to preserve fresh CF/PX cookies.');
        return;
    }
    if (!Array.isArray(sessionCookies) || sessionCookies.length === 0) return;

    try {
        const cookies = buildInjectableCookies(sessionCookies, nowSecs);

        if (cookies.length < sessionCookies.length) {
            log.warning(`Filtered ${sessionCookies.length - cookies.length} invalid/expired cookies. ` +
                `If cf_clearance expired, the scraper will earn a fresh one via session pre-warm.`);
        }
        if (cookies.length === 0) {
            log.warning('All provided sessionCookies were expired — none injected. Please export fresh cookies from your browser.');
            return;
        }

        await page.setCookie(...cookies);
        log.info(`Injected ${cookies.length} valid session cookies: ${cookies.map((c) => c.name).join(', ')}`);
        if (session) session.userData = { ...session.userData, cookiesInjected: true };
    } catch (e) {
        log.warning(`Failed to inject session cookies: ${e.message}`);
    }
}

/**
 * Applies the per-session User-Agent, matching request headers and viewport.
 * The User-Agent is picked once per session and then reused.
 *
 * @returns {Promise<boolean>} Whether the request is labelled 'DETAIL'.
 */
async function applyBrowserIdentity(page, request, session) {
    let ua = session?.userData?.userAgent;
    if (!ua) {
        ua = randomUserAgent();
        if (session) session.userData = { ...session.userData, userAgent: ua };
        log.debug(`Assigned user agent for session: ${ua}`);
    }
    await page.setUserAgent(ua);

    const isDetailPage = request.userData?.label === 'DETAIL';
    // NOTE(review): nothing in this file sets userData.listingData, so the
    // Referer below is currently never sent — confirm whether that is intended.
    const sourceUrl = request.userData?.listingData?.sourceUrl;

    const extraHeaders = {
        'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': isDetailPage ? 'same-origin' : 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
    };
    if (isDetailPage && sourceUrl) {
        extraHeaders['Referer'] = sourceUrl;
    }
    // Keep the client-hint headers in step with the Chrome version in the UA.
    const chromeVerMatch = ua.match(/Chrome\/(\d+)/);
    if (chromeVerMatch) {
        const v = chromeVerMatch[1];
        extraHeaders['sec-ch-ua'] = `"Not A(Brand";v="99", "Google Chrome";v="${v}", "Chromium";v="${v}"`;
        extraHeaders['sec-ch-ua-mobile'] = '?0';
        extraHeaders['sec-ch-ua-platform'] = '"Windows"';
    }
    await page.setExtraHTTPHeaders(extraHeaders);
    await page.setViewport({ width: 1920, height: 1080 });

    return isDetailPage;
}

/**
 * Registers a script that runs in every new document of the page and overrides
 * navigator/screen/WebGL properties with fixed desktop-like values.
 */
async function installFingerprintPatches(page) {
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });

        window.chrome = {
            runtime: {},
            loadTimes: function () { },
            csi: function () { },
            app: {},
        };

        // The native method must keep `this` bound to the Permissions object;
        // calling the bare reference throws "Illegal invocation".
        const permissions = window.navigator.permissions;
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) => (
            parameters.name === 'notifications'
                ? Promise.resolve({ state: Notification.permission })
                : originalQuery(parameters)
        );

        Object.defineProperty(navigator, 'languages', { get: () => ['tr-TR', 'tr', 'en-US', 'en'] });
        Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
        Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
        Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
        Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0 });

        Object.defineProperty(screen, 'width', { get: () => 1920 });
        Object.defineProperty(screen, 'height', { get: () => 1080 });
        Object.defineProperty(screen, 'availWidth', { get: () => 1920 });
        Object.defineProperty(screen, 'availHeight', { get: () => 1040 });
        Object.defineProperty(screen, 'colorDepth', { get: () => 24 });
        Object.defineProperty(screen, 'pixelDepth', { get: () => 24 });

        Object.defineProperty(navigator, 'plugins', {
            get: () => {
                const plugins = [
                    { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
                    { name: 'Chrome PDF Viewer', filename: 'mhjimihiapuabedfglidnhagcfenogec', description: '' },
                    { name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
                ];
                plugins.item = (i) => plugins[i];
                plugins.namedItem = (name) => plugins.find((p) => p.name === name);
                plugins.refresh = () => { };
                plugins[Symbol.iterator] = function* () { yield* Object.values(plugins); };
                Object.setPrototypeOf(plugins, PluginArray.prototype);
                return plugins;
            },
        });

        // 37445 / 37446 are the unmasked vendor / renderer parameters.
        const getParameter = WebGLRenderingContext.prototype.getParameter;
        WebGLRenderingContext.prototype.getParameter = function (parameter) {
            if (parameter === 37445) return 'Intel Inc.';
            if (parameter === 37446) return 'Intel Iris OpenGL Engine';
            return getParameter.call(this, parameter);
        };
    });
}

/**
 * If the page has no unexpired `cf_clearance` cookie and the session has not
 * been warmed up yet, loads the homepage once so a challenge (if any) can
 * resolve before the real target is requested. Best-effort: failures are
 * logged, never thrown.
 */
async function prewarmSessionIfNeeded(page, session, nowSecs) {
    const currentCookies = await page.cookies('https://www.sahibinden.com').catch(() => []);
    const cfClearanceCookie = currentCookies.find((c) => c.name === 'cf_clearance');
    // `expires` of 0/undefined/-1 denotes a session cookie, which counts as valid.
    const hasValidCfClearance = cfClearanceCookie
        ? (!cfClearanceCookie.expires || cfClearanceCookie.expires === -1 || cfClearanceCookie.expires > nowSecs)
        : false;

    if (cfClearanceCookie) {
        const expiry = cfClearanceCookie.expires;
        const expiresIn = expiry && expiry !== -1 ? Math.round((expiry - nowSecs) / 60) : null;
        log.info(`cf_clearance cookie found. expires in: ${expiresIn !== null ? expiresIn + ' min' : 'session'}, valid: ${hasValidCfClearance}`);
    } else {
        log.info('No cf_clearance cookie present in page context.');
    }

    if (hasValidCfClearance) {
        log.debug('Valid cf_clearance present, skipping pre-warm.');
        return;
    }
    if (session?.userData?.warmedUp) return;

    log.info('No valid cf_clearance found — pre-warming session via homepage...');
    try {
        await page.goto('https://www.sahibinden.com', {
            waitUntil: 'networkidle2',
            timeout: 60000,
        });
        const warmContent = await page.content();
        if (isChallengedPage(warmContent)) {
            log.info('CF challenge on homepage during pre-warm, waiting for auto-resolution...');
            await saveDebugInfo(page, 'prewarm-challenge');
            try {
                await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
                const afterWarm = await page.content();
                if (isChallengedPage(afterWarm)) {
                    log.warning('Pre-warm CF challenge did not resolve. Will attempt target URL anyway.');
                    await saveDebugInfo(page, 'prewarm-challenge-unresolved');
                } else {
                    log.info('Pre-warm CF challenge resolved.');
                }
            } catch (e) {
                log.warning(`Pre-warm CF challenge wait timed out: ${e.message}`);
            }
        } else {
            log.info('Pre-warm homepage loaded successfully (no challenge).');
        }
        if (session) session.userData = { ...session.userData, warmedUp: true };
        await randomDelay(1500, 3000);
    } catch (e) {
        log.warning(`Session pre-warm failed: ${e.message}`);
    }
}

/**
 * Handles a 403/429/503 response: tries the PerimeterX hold, or waits for a
 * Cloudflare interstitial to navigate away. Returns normally only when the
 * block appears resolved; otherwise marks the session bad and throws so the
 * request is retried.
 *
 * @throws {Error} When the challenge is not recognised or not resolved.
 */
async function resolveBlockedResponse(page, session, statusCode) {
    await saveDebugInfo(page, `${statusCode}-initial`);

    await randomDelay(5000, 10000);

    try {
        await page.mouse.move(100 + Math.random() * 500, 100 + Math.random() * 500);
        await page.mouse.move(200 + Math.random() * 500, 200 + Math.random() * 500);
    } catch (e) {
        // Purely cosmetic activity; a failure here must not fail the request.
        log.debug(`Mouse movement on blocked page failed: ${e.message}`);
    }

    const content = await page.content();

    if (isPxHoldChallenge(content)) {
        log.info('PerimeterX hold challenge detected — attempting automated hold...');
        await saveDebugInfo(page, `${statusCode}-px-hold`);
        const pxSolved = await tryHoldPxButton(page);
        if (!pxSolved) {
            log.warning('PerimeterX hold challenge failed. Provide _px3/_pxhd/_pxvid/pxcts cookies from your browser to bypass this.');
            await saveDebugInfo(page, `${statusCode}-px-hold-failed`);
            if (session) session.markBad();
            throw new Error('PerimeterX hold challenge not resolved');
        }
        return;
    }

    if (!isChallengedPage(content)) {
        // Report the actual status; this branch is also reached for 429 and 503.
        log.warning(`Received ${statusCode} without recognized challenge page. Marking session as bad.`);
        await saveDebugInfo(page, `${statusCode}-unknown-block`);
        if (session) session.markBad();
        throw new Error(`Blocked with status ${statusCode}`);
    }

    log.info('Cloudflare challenge page detected, waiting for auto-resolution...');
    await saveDebugInfo(page, `${statusCode}-challenge`);
    // Only the navigation wait is treated as a "timeout"; the verdicts thrown
    // further down propagate unchanged instead of being matched by message text.
    try {
        await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 45000 });
    } catch (e) {
        log.warning('Cloudflare challenge did not resolve in time. Retrying...');
        await saveDebugInfo(page, `${statusCode}-challenge-timeout`);
        if (session) session.markBad();
        throw new Error('Cloudflare challenge timeout', { cause: e });
    }

    const resolvedContent = await page.content();
    if (isPxHoldChallenge(resolvedContent)) {
        log.info('CF resolved but now hit PX hold challenge — attempting hold...');
        await saveDebugInfo(page, `${statusCode}-cf-then-px`);
        const pxSolved = await tryHoldPxButton(page);
        if (!pxSolved) {
            if (session) session.markBad();
            throw new Error('PerimeterX hold challenge not resolved after CF');
        }
    } else if (isChallengedPage(resolvedContent)) {
        log.warning('Cloudflare challenge navigated to another challenge page. Marking session bad.');
        await saveDebugInfo(page, `${statusCode}-challenge-still-blocked`);
        if (session) session.markBad();
        throw new Error('Cloudflare Turnstile challenge requires manual verification');
    } else {
        log.info('Cloudflare challenge resolved!');
    }
}

// Crawler wiring: proxy, session pool, browser launch flags and the
// navigation hooks built from the helpers above.
const crawler = new PuppeteerCrawler({
    proxyConfiguration: proxyConfig,
    maxConcurrency,
    // Without an item limit the crawl is capped at 1000 requests.
    maxRequestsPerCrawl: maxItems ? maxItems * 3 : 1000,
    maxRequestRetries: 8,
    navigationTimeoutSecs: 90,
    requestHandlerTimeoutSecs: 180,

    useSessionPool: true,
    persistCookiesPerSession: true,
    sessionPoolOptions: {
        maxPoolSize: 10,
        sessionOptions: {
            maxUsageCount: 50,
        },
    },

    browserPoolOptions: {
        retireBrowserAfterPageCount: 20,
    },

    launchContext: {
        launcher: puppeteer,
        launchOptions: {
            // Headless unless HEADLESS=false is set in the environment.
            headless: process.env.HEADLESS !== 'false',
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--disable-infobars',
                '--window-size=1920,1080',
                '--start-maximized',
                '--disable-features=IsolateOrigins,site-per-process',
                '--disable-site-isolation-trials',
            ],
            ignoreDefaultArgs: ['--enable-automation'],
        },
        useChrome: true,
    },

    preNavigationHooks: [
        async ({ page, request, session }, gotoOptions) => {
            const nowSecs = Date.now() / 1000;

            await injectSessionCookies(page, session, nowSecs);

            // Identity and fingerprint patches are applied BEFORE the pre-warm
            // navigation so the homepage visit and the target request present
            // the same User-Agent; presumably a clearance earned under another
            // identity would not be honoured for the target — TODO confirm.
            const isDetailPage = await applyBrowserIdentity(page, request, session);
            await installFingerprintPatches(page);

            await prewarmSessionIfNeeded(page, session, nowSecs);

            if (isDetailPage) {
                await randomDelay(4000, 8000);
            }

            if (gotoOptions) {
                gotoOptions.waitUntil = 'networkidle2';
                gotoOptions.timeout = 90000;
            }
        },
    ],

    postNavigationHooks: [
        async ({ page, response, request, session }) => {
            const statusCode = response?.status();
            log.info(`Response status: ${statusCode} for ${request.url}`);

            if (statusCode === 403 || statusCode === 503 || statusCode === 429) {
                log.warning(`Got ${statusCode} for ${request.url} — checking page content...`);
                await resolveBlockedResponse(page, session, statusCode);
            }

            // Intermediate "tloading" page that is expected to redirect by itself.
            if (page.url().includes('/cs/tloading')) {
                log.info('Detected tloading protection page, waiting for JS redirect...');
                try {
                    await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
                    log.info(`tloading resolved, now at: ${page.url()}`);
                } catch (e) {
                    log.warning('tloading page did not redirect in time. Marking session bad and retrying.');
                    if (session) session.markBad();
                    throw new Error('tloading page did not resolve', { cause: e });
                }
            }

            if (page.url().includes('/giris') || page.url().includes('secure.sahibinden.com')) {
                log.error('Redirected to login page. Your session cookies are missing or expired.');
                if (session) session.markBad();
                throw new Error('Mandatory login required. Please update the sessionCookies input.');
            }

            if (statusCode && statusCode >= 200 && statusCode < 300) {
                if (session) session.markGood();
            }
        },
    ],

    requestHandler: async ({ page, request, enqueueLinks }) => {
        const label = request.userData?.label || 'CATEGORY';
        log.info(`Processing page [${label}]: ${request.url}`);

        await randomDelay(2000, 5000);

        try {
            await page.waitForSelector('body', { timeout: 45000 });

            // NOTE(review): every label, including 'DETAIL', is routed to the
            // category handler; no detail handler is visible in this file.
            await handleCategoryPage(page, request, enqueueLinks);
        } catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            log.error(`Error processing ${request.url}: ${errorMessage}`, {
                stack: error instanceof Error ? error.stack : undefined,
            });
            throw error;
        }
    },

    failedRequestHandler: async ({ request }) => {
        log.error(`Request failed after retries: ${request.url}`, {
            errors: request.errorMessages,
        });
    },
});
594
595
// Crawlee runs its own blocked-status check on the ORIGINAL response status
// after the post-navigation hooks. The hooks above already deal with
// 403/429/503 themselves (and throw when a challenge is not resolved), so the
// built-in check is bypassed for exactly those codes. 429 is included because
// the hooks handle it too; leaving it out would reject a page whose challenge
// was just resolved.
// NOTE(review): this patches a private Crawlee method; the public
// `sessionPoolOptions.blockedStatusCodes` option may be a sturdier alternative.
const originalThrowOnBlocked = crawler._throwOnBlockedRequest?.bind(crawler);
if (originalThrowOnBlocked) {
    const hookHandledStatusCodes = new Set([403, 429, 503]);
    crawler._throwOnBlockedRequest = function (session, statusCode) {
        if (hookHandledStatusCodes.has(statusCode)) {
            log.debug(`Suppressing Crawlee's built-in ${statusCode} block check (handled by postNavigationHook)`);
            return;
        }
        return originalThrowOnBlocked(session, statusCode);
    };
    log.info('Overridden Crawlee blocked request check for Cloudflare compatibility');
} else {
    log.warning('Could not override Crawlee blocked request check (internal method not found); '
        + 'built-in status blocking remains active.');
}
607
608
609
610
/**
 * Writes a batch of listings to the dataset and, when a BaseRow integration is
 * available, mirrors it there. The BaseRow write is best-effort: a failure is
 * logged and does not fail the page.
 *
 * @param {Array<object>} results - Listings extracted from one page.
 */
async function persistListings(results) {
    if (results.length === 0) return;
    await Actor.pushData(results);
    if (!baseRowIntegration) return;
    try {
        await baseRowIntegration.storeListings(results);
    } catch (error) {
        log.warning('Failed to store data in BaseRow', { error: error.message });
    }
}

/**
 * Locates the listing rows on a search-results page. Tries the primary
 * selector first (waiting up to 15 s), then a list of alternatives. When
 * nothing matches, logs structural diagnostics and throws.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} primarySelector - Preferred row selector.
 * @returns {Promise<Array<object>>} Non-empty list of element handles.
 * @throws {Error} When no selector matches any element.
 */
async function findListingElements(page, primarySelector) {
    try {
        await page.waitForSelector(primarySelector, { timeout: 15000 });
        const elements = await page.$$(primarySelector);
        await saveDebugInfo(page, 'category-loaded');
        return elements;
    } catch (e) {
        log.warning(`Primary selector failed: ${primarySelector}`);
    }

    const pageTitle = await page.title().catch(() => 'unknown');
    log.info('Page state when selector failed:', { title: pageTitle, url: page.url() });
    await saveDebugInfo(page, 'category-selector-failed');

    const alternativeSelectors = [
        'table.searchResultsTable tr.searchResultsItem',
        '.searchResultsRowClass .searchResultsItem',
        'tr.searchResultsItem',
        '.classified-list-item',
        '[data-id]',
        '.searchResults .result-item',
        'table tr[data-id]',
    ];
    for (const altSelector of alternativeSelectors) {
        const altElements = await page.$$(altSelector);
        if (altElements.length > 0) {
            log.info(`Found ${altElements.length} elements with alternative selector: ${altSelector}`);
            return altElements;
        }
    }

    // Nothing matched: dump enough structure to adjust the selectors later.
    const tableCount = await page.$$eval('table', (tables) => tables.length).catch(() => 0);
    const trCount = await page.$$eval('tr', (rows) => rows.length).catch(() => 0);
    const tbodyCount = await page.$$eval('tbody', (bodies) => bodies.length).catch(() => 0);
    log.info('DEBUG: Page structure', { tables: tableCount, rows: trCount, tbodies: tbodyCount });

    const searchClasses = await page.evaluate(() => {
        const keywords = ['search', 'result', 'listing', 'classified'];
        const classes = new Set();
        document.querySelectorAll('*').forEach((el) => {
            if (el.className && typeof el.className === 'string') {
                el.className.split(' ').forEach((cls) => {
                    const lower = cls.toLowerCase();
                    if (keywords.some((k) => lower.includes(k))) classes.add(cls);
                });
            }
        });
        return Array.from(classes);
    }).catch(() => []);
    log.info('DEBUG: Relevant CSS classes found:', { classes: searchClasses });

    throw new Error('No listing elements found with any selector');
}

/**
 * Reads all raw fields of one listing row in a single browser round-trip.
 * Fields whose element is missing come back as null.
 *
 * @param {object} element - Element handle of the row.
 * @param {object} selectors - CSS selectors keyed by field.
 * @returns {Promise<object>} Raw (un-normalised) field values.
 */
async function extractListingRow(element, selectors) {
    return element.evaluate((row, sel) => {
        const text = (s) => row.querySelector(s)?.textContent?.trim() ?? null;
        const visibleText = (s, joiner) => row.querySelector(s)?.innerText?.trim().replace(/\n/g, joiner) ?? null;
        const titleLink = row.querySelector(sel.title);
        const img = row.querySelector('img');
        return {
            title: titleLink?.textContent?.trim() ?? null,
            detailUrl: titleLink?.href ?? null,
            priceText: text(sel.price),
            pricePerSqmText: text(sel.pricePerSqm),
            areaText: text(sel.area),
            location: visibleText(sel.location, ' / '),
            date: visibleText(sel.date, ' '),
            image: img ? (img.src || img.dataset?.src || null) : null,
            dataId: row.getAttribute('data-id'),
        };
    }, selectors);
}

/**
 * Scrapes one search-results page: extracts every listing row, stores the
 * results, enqueues the next results page and aborts the crawl once
 * `maxItems` listings have been collected.
 *
 * Errors are logged as warnings and not rethrown, so a page that cannot be
 * parsed does not fail (or retry) the request.
 *
 * @param {object} page - Puppeteer page already navigated to the results page.
 * @param {object} request - Crawlee request being processed.
 * @param {Function} enqueueLinks - Crawlee helper used to enqueue the next page.
 * @returns {Promise<void>}
 */
async function handleCategoryPage(page, request, enqueueLinks) {
    log.info(`Handling category page: ${request.url}`);

    const listingRowSelector = 'tbody.searchResultsRowClass > tr.searchResultsItem';
    const nextPageSelector = 'a.prevNextBut[title="Sonraki"]:not(.passive)';
    const rowSelectors = {
        title: 'td.searchResultsTitleValue a.classifiedTitle',
        price: 'td.searchResultsPriceValue span',
        pricePerSqm: 'td.searchResultsPriceValue:nth-of-type(2)',
        area: 'td.searchResultsAttributeValue',
        date: 'td.searchResultsDateValue',
        location: 'td.searchResultsLocationValue',
    };
    const limitReached = () => maxItems !== null && scrapedItemsCount >= maxItems;

    try {
        const listingElements = await findListingElements(page, listingRowSelector);
        log.info(`Found ${listingElements.length} listings on page.`);

        const results = [];

        for (const element of listingElements) {
            if (limitReached()) {
                log.info(`Maximum items limit (${maxItems}) reached. Stopping scrape.`);
                await persistListings(results);
                await crawler.autoscaledPool?.abort();
                return;
            }

            try {
                const row = await extractListingRow(element, rowSelectors);

                if (!row.title || !row.detailUrl) {
                    log.debug('Skipping row due to missing title or detailUrl.');
                    continue;
                }

                // Pages are processed concurrently, so another handler may have
                // filled the quota while this row was being read. Re-check and
                // count with no await in between so the limit is not overshot.
                if (limitReached()) break;

                results.push({
                    id: row.dataId ?? extractListingId(row.detailUrl),
                    url: row.detailUrl,
                    title: normalizeText(row.title),
                    price: formatPrice(row.priceText),
                    price_currency: extractCurrency(row.priceText),
                    price_raw: row.priceText,
                    price_per_sqm: normalizeText(row.pricePerSqmText),
                    area: normalizeText(row.areaText),
                    location: normalizeText(row.location),
                    date: normalizeText(row.date),
                    image: row.image,
                    scrapedAt: new Date().toISOString(),
                    sourceUrl: request.url,
                });
                scrapedItemsCount++;
            } catch (extractError) {
                const errorMsg = extractError instanceof Error ? extractError.message : String(extractError);
                log.warning(`Could not process one item on ${request.url}`, { error: errorMsg });
            }
        }

        if (results.length > 0) {
            await persistListings(results);
            log.info(`Pushed ${results.length} listings from page. Total scraped: ${scrapedItemsCount}`);
        } else {
            log.info(`No listings extracted from page ${request.url}.`);
        }

        if (limitReached()) {
            log.info(`Maximum items limit (${maxItems}) reached.`);
            await crawler.autoscaledPool?.abort();
            return;
        }

        const nextPageUrl = await page.$eval(nextPageSelector, (anchor) => anchor.href).catch(() => null);
        if (nextPageUrl) {
            log.info(`Enqueueing next category page: ${nextPageUrl}`);
            const absoluteNextPageUrl = new URL(nextPageUrl, request.loadedUrl || request.url).toString();
            await enqueueLinks({ urls: [absoluteNextPageUrl], userData: { label: 'CATEGORY' } });
        } else {
            log.info(`No next page button found on ${request.url}`);
        }
    } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        log.warning(`Could not handle category page ${request.url}: ${errorMessage}`);
    }
}
781
782
783
784
// Build the initial requests from `startUrls`, which may be a single item or
// an array of strings / `{ url }` objects. Invalid entries are skipped with a
// warning.
const startRequests = (Array.isArray(startUrls) ? startUrls : [startUrls]).flatMap((item) => {
    let urlString;
    if (typeof item === 'string') {
        urlString = item;
    } else if (item && typeof item.url === 'string') {
        urlString = item.url;
    } else {
        log.warning('Skipping invalid start URL item:', { item });
        return [];
    }

    if (!urlString || !urlString.startsWith('http')) {
        log.warning('Skipping item with invalid URL string:', { urlString });
        return [];
    }

    // URLs containing both "/ilan/" and "/detay" are labelled as detail pages.
    const isDetailUrl = urlString.includes('/ilan/') && urlString.includes('/detay');
    return [{ url: urlString, userData: { label: isDetailUrl ? 'DETAIL' : 'CATEGORY' } }];
});

if (startRequests.length > 0) {
    await crawler.addRequests(startRequests);
    log.info(`Added ${startRequests.length} initial requests to the queue.`);
} else {
    log.warning('No valid start URLs found in the input. Exiting.');
    // Actor.fail() ends the run with a non-zero exit code and the given status
    // message. The previous Actor.exit(1, '...') call passed a number where the
    // SDK expects a message or options object, so neither the exit code nor
    // the message took effect.
    await Actor.fail('No valid start URLs provided.');
}

log.info('Starting the crawler...');
await crawler.run();
log.info(`Crawler finished. Total items scraped: ${scrapedItemsCount}`);

await Actor.exit();