1
2import { Actor } from 'apify';
3import { PuppeteerCrawler, log } from 'crawlee';
4import puppeteer from 'puppeteer-extra';
5import StealthPlugin from 'puppeteer-extra-plugin-stealth';
6import {
7 randomUserAgent,
8 randomDelay,
9 formatPrice,
10 extractCurrency,
11 normalizeText,
12 extractListingId,
13} from './utils.js';
14import { createBaseRowIntegration } from './baserow.js';
15
16
// Register the stealth plugin on the shared puppeteer-extra instance.
puppeteer.use(StealthPlugin());

// Initialise the Actor runtime before reading the input.
await Actor.init();

// Actor input; an empty object is used when no input was provided so that
// every option below falls back to its default.
const input = (await Actor.getInput()) || {};

const {
    // --- Crawl scope -------------------------------------------------------
    startUrls = [{ url: 'https://www.sahibinden.com/vasita/otomobil?sorting=date_desc' }],
    maxItems = null, // null means "no limit"
    includeDetails = false,
    maxConcurrency = 3,

    // --- Proxy -------------------------------------------------------------
    proxyConfiguration = {
        useApifyProxy: true,
        apifyProxyGroups: ['RESIDENTIAL'],
        countryCode: 'TR',
    },

    // --- BaseRow credentials ----------------------------------------------
    // NOTE(review): these three values are not referenced anywhere in the
    // visible part of this file — confirm how ./baserow.js obtains them.
    baseRowApiToken,
    baseRowTableId,
    baseRowDatabaseId,

    // --- Anti-bot helpers --------------------------------------------------
    // Cookies exported from a real browser session; injected once per session.
    sessionCookies = [],

    // When true, screenshots/HTML are stored in the key-value store.
    debugMode = false,
} = input;
43
44
// Apify Proxy defaults applied whenever the input does not route traffic
// through custom proxy URLs.
const APIFY_PROXY_DEFAULTS = {
    useApifyProxy: true,
    apifyProxyGroups: ['RESIDENTIAL'],
    countryCode: 'TR',
};

const requestedProxyConfiguration = proxyConfiguration || {};

// Custom `proxyUrls` must not be mixed with Apify Proxy groups / country code:
// the Apify SDK rejects that combination, so forcing the defaults onto a
// custom-proxy input would abort the run before crawling starts.
const usesCustomProxyUrls = Array.isArray(requestedProxyConfiguration.proxyUrls)
    && requestedProxyConfiguration.proxyUrls.length > 0;

const finalProxyConfiguration = usesCustomProxyUrls
    ? { ...requestedProxyConfiguration }
    : { ...APIFY_PROXY_DEFAULTS, ...requestedProxyConfiguration };

if (!usesCustomProxyUrls) {
    // The input may explicitly carry an empty group list or an empty country;
    // fall back to the defaults in that case as well.
    if (!finalProxyConfiguration.apifyProxyGroups || finalProxyConfiguration.apifyProxyGroups.length === 0) {
        finalProxyConfiguration.apifyProxyGroups = ['RESIDENTIAL'];
    }
    if (!finalProxyConfiguration.countryCode) {
        finalProxyConfiguration.countryCode = 'TR';
    }
}

// The result is checked for falsiness by the startup logging below this block.
const proxyConfig = await Actor.createProxyConfiguration(finalProxyConfiguration);
60
// Names of the cookies supplied through the `sessionCookies` input.
const cookieNames = (sessionCookies || []).map((cookie) => cookie.name);
const hasCfClearanceInput = cookieNames.includes('cf_clearance');
// True when at least one of the PerimeterX cookies was supplied.
const hasPxCookies = cookieNames.some((name) => ['_px3', '_pxhd', '_pxvid', 'pxcts'].includes(name));

// Start URLs may be given either as plain strings or as `{ url }` objects.
const startUrlsForLog = startUrls.map((entry) => {
    if (typeof entry === 'string') return entry;
    return entry.url;
});

log.info('Starting Sahibinden Car Scraper', {
    startUrls: startUrlsForLog,
    maxItems,
    includeDetails,
    maxConcurrency,
    proxyGroups: finalProxyConfiguration.apifyProxyGroups,
    countryCode: finalProxyConfiguration.countryCode,
    sessionCookiesProvided: cookieNames.length,
    cookieNames,
    hasCfClearance: hasCfClearanceInput,
    hasPerimeterXCookies: hasPxCookies,
});

if (!hasPxCookies) {
    log.warning(
        'No PerimeterX cookies provided (_px3, _pxhd, _pxvid, pxcts). '
        + 'If sahibinden shows a "Basılı Tutun" challenge, the actor will try to hold it automatically. '
        + 'For reliable bypass: visit sahibinden.com in your browser, solve the hold challenge, then export ALL cookies.',
    );
}

if (!proxyConfig) {
    log.warning('No proxy configuration specified. Sahibinden.com requires RESIDENTIAL proxy!');
} else {
    log.info('Using proxy configuration', {
        type: proxyConfig.usesApifyProxy ? 'Apify Proxy' : 'Custom Proxies',
        groups: finalProxyConfiguration.apifyProxyGroups,
        country: finalProxyConfiguration.countryCode,
    });
}

// BaseRow is optional: when its initialisation fails the crawl continues and
// results are only pushed to the default dataset.
// NOTE(review): createBaseRowIntegration() is called without arguments although
// baseRow* values are read from the input — confirm it picks them up itself.
let baseRowIntegration = null;
try {
    baseRowIntegration = await createBaseRowIntegration();
} catch (error) {
    log.warning('BaseRow integration initialization failed, continuing without it.', { error: error.message });
}

// Number of listings stored so far; compared against `maxItems`.
let scrapedItemsCount = 0;
103
104
105
106
107
108
// Substrings (English and Turkish variants) whose presence marks the HTML as a
// browser-verification / challenge page.
const CHALLENGE_PAGE_MARKERS = [
    'Just a moment',
    'Checking your browser',
    'cf-browser-verification',
    'challenge-platform',
    'Güvenlik doğrulaması gerçekleştirme',
    'Bir dakika lütfen',
    'Uyumsuz tarayıcı eklentisi',
];

/**
 * Tells whether the given HTML contains one of the known challenge-page markers.
 *
 * @param {string} html - Serialized page content.
 * @returns {boolean} `true` when any marker is found; `false` otherwise,
 *   including when `html` is empty or not a string (instead of throwing).
 */
function isChallengedPage(html) {
    if (typeof html !== 'string' || html.length === 0) return false;
    return CHALLENGE_PAGE_MARKERS.some((marker) => html.includes(marker));
}
120
121
// Substrings whose presence marks the HTML as a PerimeterX "press and hold"
// challenge page ("Basılı Tutun" is the Turkish label of the hold button).
const PX_HOLD_CHALLENGE_MARKERS = [
    'Basılı Tutun',
    'px-captcha',
    '_pxCaptcha',
    'PerimeterX',
    'Bağlantınız kontrol ediliyor',
    'human-challenge',
];

/**
 * Tells whether the given HTML contains one of the known PerimeterX hold-challenge markers.
 *
 * @param {string} html - Serialized page content.
 * @returns {boolean} `true` when any marker is found; `false` otherwise,
 *   including when `html` is empty or not a string (instead of throwing).
 */
function isPxHoldChallenge(html) {
    if (typeof html !== 'string' || html.length === 0) return false;
    return PX_HOLD_CHALLENGE_MARKERS.some((marker) => html.includes(marker));
}
132
133
/**
 * Tries to solve a PerimeterX "press and hold" challenge by pressing the mouse
 * button on the challenge element, keeping it pressed, and releasing it.
 *
 * Never throws: every failure is logged and reported as `false`.
 *
 * @param {object} page - Puppeteer page currently showing the challenge.
 * @param {number} [holdMs=10000] - How long the mouse button is kept pressed, in milliseconds.
 * @returns {Promise<boolean>} `true` when, after the hold, the page content no
 *   longer matches `isPxHoldChallenge` / `isChallengedPage`; `false` otherwise.
 */
async function tryHoldPxButton(page, holdMs = 10000) {
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    try {
        // Ordered from most to least specific; the bare `button` is a last resort.
        const selectors = [
            '#px-captcha',
            '.px-captcha-container',
            'div[id^="px-captcha"]',
            'button',
        ];

        let holdTarget = null;
        for (const sel of selectors) {
            holdTarget = await page.$(sel).catch(() => null);
            if (holdTarget) {
                log.info(`Found PX hold target with selector: ${sel}`);
                break;
            }
        }

        if (!holdTarget) {
            log.warning('Could not find PX hold button element.');
            return false;
        }

        const box = await holdTarget.boundingBox();
        if (!box) {
            log.warning('PX hold button has no bounding box (not visible?)');
            return false;
        }

        // Centre of the element.
        const cx = box.x + box.width / 2;
        const cy = box.y + box.height / 2;

        log.info(`Attempting PX hold at (${Math.round(cx)}, ${Math.round(cy)}) for ${Math.round(holdMs / 1000)}s...`);

        // Approach the target in two moves with short pauses instead of jumping onto it.
        await page.mouse.move(cx - 50, cy - 30);
        await sleep(300);
        await page.mouse.move(cx, cy, { steps: 10 });
        await sleep(200);

        await page.mouse.down();
        await sleep(holdMs);

        // Start listening for the navigation BEFORE releasing the button: a
        // redirect fired right on release could otherwise complete before the
        // listener exists, and the whole timeout would be wasted.
        // A timeout is expected when the challenge does not navigate, hence the catch.
        const navigation = page
            .waitForNavigation({ waitUntil: 'networkidle2', timeout: 15000 })
            .catch(() => {});

        await page.mouse.up();

        log.info('Released PX hold button, waiting for redirect...');
        await navigation;

        const afterHtml = await page.content();
        if (isPxHoldChallenge(afterHtml) || isChallengedPage(afterHtml)) {
            log.warning('PX hold did not resolve the challenge.');
            return false;
        }

        log.info('PX hold challenge resolved successfully!');
        return true;
    } catch (e) {
        log.warning(`PX hold attempt error: ${e.message}`);
        return false;
    }
}
195
// Monotonic counter used to keep debug artefacts ordered in the key-value store.
let debugCounter = 0;

/**
 * Stores a full-page screenshot and the page HTML in the key-value store and
 * logs a truncated cookie summary. Does nothing unless `debugMode` is enabled.
 *
 * Each of the three steps is independent: a failure in one is logged as a
 * warning and does not prevent the others from running. Never throws.
 *
 * @param {object} page - Puppeteer page to capture.
 * @param {string} label - Short tag appended to the generated keys, e.g. `403-initial`.
 * @returns {Promise<void>}
 */
async function saveDebugInfo(page, label) {
    if (!debugMode) return;

    const idx = ++debugCounter;
    const key = `DEBUG-${String(idx).padStart(3, '0')}-${label}`;

    try {
        const screenshot = await page.screenshot({ fullPage: true, type: 'png' });
        await Actor.setValue(`${key}-screenshot`, screenshot, { contentType: 'image/png' });
        log.info(`[DEBUG] Screenshot saved → KV store key: "${key}-screenshot"`);
    } catch (e) {
        log.warning(`[DEBUG] Could not save screenshot: ${e.message}`);
    }

    try {
        const html = await page.content();
        await Actor.setValue(`${key}-html`, html, { contentType: 'text/html' });
        log.info(`[DEBUG] HTML saved → KV store key: "${key}-html" (${html.length} chars)`);
    } catch (e) {
        log.warning(`[DEBUG] Could not save HTML: ${e.message}`);
    }

    try {
        const cookies = await page.cookies();
        // Only the first 20 characters of each value are logged.
        const cookieSummary = cookies.map((c) => `${c.name}=${c.value.substring(0, 20)}... (expires: ${c.expires})`);
        log.info(`[DEBUG] Cookies at "${label}":`, { cookies: cookieSummary });
    } catch (e) {
        // Previously swallowed silently; report it like the other two steps.
        log.warning(`[DEBUG] Could not read cookies: ${e.message}`);
    }
}
221
222
223
224
/**
 * Tells whether the URL or title belongs to one of the intermediate
 * "loading" pages (`/cs/tloading`, `/cs/checkLoading`, or a title containing
 * "Yükleniyor" / "Bir dakika").
 *
 * @param {string} url - Current page URL.
 * @param {string} title - Current page title.
 * @returns {boolean}
 */
function isIntermediateLoadingPage(url, title) {
    return (
        url.includes('/cs/tloading')
        || url.includes('/cs/checkLoading')
        || title.includes('Yükleniyor')
        || title.includes('Bir dakika')
    );
}

/**
 * Clicks the first button/link/submit/div whose text contains "Devam Et", if any.
 * Best effort: evaluation errors are logged at debug level and reported as `false`.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<boolean>} `true` when an element was clicked.
 */
async function clickDevamEtButton(page) {
    try {
        return await page.evaluate(() => {
            const candidates = Array.from(document.querySelectorAll('button, a, input[type="submit"], div'));
            const target = candidates.find((el) => el.textContent && el.textContent.includes('Devam Et'));
            if (!target) return false;
            target.click();
            return true;
        });
    } catch (e) {
        log.debug(`Could not look for the "Devam Et" button: ${e.message}`);
        return false;
    }
}

/**
 * Builds the extra HTTP headers for a document navigation.
 *
 * @param {string} ua - User agent string; its Chrome major version (if present)
 *   is mirrored into the `sec-ch-ua*` headers.
 * @param {object} [options]
 * @param {boolean} [options.sameOrigin=false] - Sets `Sec-Fetch-Site` to `same-origin` instead of `none`.
 * @param {string} [options.referer] - Value for the `Referer` header; omitted when falsy.
 * @returns {Record<string, string>}
 */
function buildNavigationHeaders(ua, { sameOrigin = false, referer } = {}) {
    const headers = {
        'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': sameOrigin ? 'same-origin' : 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
    };
    if (referer) {
        headers['Referer'] = referer;
    }
    const chromeVerMatch = ua.match(/Chrome\/(\d+)/);
    if (chromeVerMatch) {
        const v = chromeVerMatch[1];
        headers['sec-ch-ua'] = `"Not A(Brand";v="99", "Google Chrome";v="${v}", "Chromium";v="${v}"`;
        headers['sec-ch-ua-mobile'] = '?0';
        headers['sec-ch-ua-platform'] = '"Windows"';
    }
    return headers;
}

/**
 * Fingerprint overrides installed with `page.evaluateOnNewDocument`.
 * Runs inside the browser, so it must not reference anything from this module.
 */
function applyFingerprintOverrides() {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });

    window.chrome = {
        runtime: {},
        loadTimes: function () { },
        csi: function () { },
        app: {},
    };

    // The original method must be invoked with `permissions` as receiver;
    // calling the detached function throws "Illegal invocation".
    const permissions = window.navigator.permissions;
    const originalQuery = permissions.query;
    permissions.query = (parameters) => (
        parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery.call(permissions, parameters)
    );

    Object.defineProperty(navigator, 'languages', { get: () => ['tr-TR', 'tr', 'en-US', 'en'] });
    Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
    Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
    Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
    Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0 });

    Object.defineProperty(screen, 'width', { get: () => 1920 });
    Object.defineProperty(screen, 'height', { get: () => 1080 });
    Object.defineProperty(screen, 'availWidth', { get: () => 1920 });
    Object.defineProperty(screen, 'availHeight', { get: () => 1040 });
    Object.defineProperty(screen, 'colorDepth', { get: () => 24 });
    Object.defineProperty(screen, 'pixelDepth', { get: () => 24 });

    Object.defineProperty(navigator, 'plugins', {
        get: () => {
            const plugins = [
                { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
                { name: 'Chrome PDF Viewer', filename: 'mhjimihiapuabedfglidnhagcfenogec', description: '' },
                { name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
            ];
            plugins.item = (i) => plugins[i];
            plugins.namedItem = (name) => plugins.find((p) => p.name === name);
            plugins.refresh = () => { };
            plugins[Symbol.iterator] = function* () { yield* Object.values(plugins); };
            Object.setPrototypeOf(plugins, PluginArray.prototype);
            return plugins;
        },
    });

    // 37445 / 37446 are the UNMASKED_VENDOR_WEBGL / UNMASKED_RENDERER_WEBGL parameters.
    const getParameter = WebGLRenderingContext.prototype.getParameter;
    WebGLRenderingContext.prototype.getParameter = function (parameter) {
        if (parameter === 37445) return 'Intel Inc.';
        if (parameter === 37446) return 'Intel Iris OpenGL Engine';
        return getParameter.call(this, parameter);
    };
}

/**
 * The crawler. Navigation is wrapped by:
 *  - a pre-navigation hook: cookie injection, browser identity, optional homepage pre-warm, headers;
 *  - a post-navigation hook: challenge handling for 403/503/429, loading-page and login detection;
 *  - the request handler: last-chance challenge checks, then dispatch to the
 *    DETAIL or CATEGORY page handler.
 */
const crawler = new PuppeteerCrawler({
    proxyConfiguration: proxyConfig,
    maxConcurrency,
    maxRequestsPerCrawl: maxItems ? maxItems * 3 : 1000,
    maxRequestRetries: 8,
    navigationTimeoutSecs: 90,
    requestHandlerTimeoutSecs: 180,

    useSessionPool: true,
    persistCookiesPerSession: true,
    sessionPoolOptions: {
        maxPoolSize: 10,
        sessionOptions: {
            maxUsageCount: 50,
        },
    },

    browserPoolOptions: {
        retireBrowserAfterPageCount: 20,
    },

    launchContext: {
        launcher: puppeteer,
        launchOptions: {
            headless: process.env.HEADLESS !== 'false',
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--disable-infobars',
                '--window-size=1920,1080',
                '--start-maximized',
                '--disable-features=IsolateOrigins,site-per-process',
                '--disable-site-isolation-trials',
            ],
            ignoreDefaultArgs: ['--enable-automation'],
        },
        useChrome: true,
    },

    preNavigationHooks: [
        async ({ page, request, session }, gotoOptions) => {
            const nowSecs = Date.now() / 1000;

            // --- 1. Inject the user-supplied cookies once per session -------
            const alreadyInjected = session?.userData?.cookiesInjected === true;
            if (!alreadyInjected && sessionCookies && Array.isArray(sessionCookies) && sessionCookies.length > 0) {
                try {
                    const validCookies = sessionCookies.filter((c) => {
                        const expiry = c.expirationDate ?? c.expires ?? null;
                        if (expiry && expiry < nowSecs) {
                            log.debug(`Skipping expired cookie: ${c.name}`);
                            return false;
                        }
                        return true;
                    });

                    if (validCookies.length < sessionCookies.length) {
                        log.warning(`Filtered ${sessionCookies.length - validCookies.length} expired cookies. `
                            + `If cf_clearance expired, the scraper will earn a fresh one via session pre-warm.`);
                    }

                    if (validCookies.length > 0) {
                        const formattedCookies = validCookies.map((c) => ({
                            name: c.name,
                            value: c.value,
                            domain: c.domain || '.sahibinden.com',
                            path: c.path || '/',
                            secure: c.secure !== false,
                            httpOnly: c.httpOnly === true,
                            sameSite: c.sameSite === 'no_restriction' ? 'None' : (c.sameSite || 'Lax'),
                        }));
                        await page.setCookie(...formattedCookies);
                        log.info(`Injected ${formattedCookies.length} valid session cookies: ${formattedCookies.map((c) => c.name).join(', ')}`);
                        if (session) session.userData = { ...session.userData, cookiesInjected: true };
                    } else {
                        log.warning('All provided sessionCookies were expired — none injected. Please export fresh cookies from your browser.');
                    }
                } catch (e) {
                    log.warning(`Failed to inject session cookies: ${e.message}`);
                }
            } else if (alreadyInjected) {
                log.debug('Session already has cookies from previous request — skipping re-injection to preserve fresh CF/PX cookies.');
            }

            // --- 2. Browser identity ----------------------------------------
            // Applied BEFORE any navigation (including the pre-warm below) so
            // that the pre-warm and the real request present the same user
            // agent, viewport and fingerprint overrides.
            let ua = session?.userData?.userAgent;
            if (!ua) {
                ua = randomUserAgent();
                if (session) session.userData = { ...session.userData, userAgent: ua };
                log.debug(`Assigned user agent for session: ${ua}`);
            }
            await page.setUserAgent(ua);
            await page.setViewport({ width: 1920, height: 1080 });
            await page.evaluateOnNewDocument(applyFingerprintOverrides);

            // --- 3. cf_clearance check and optional homepage pre-warm -------
            const allCurrentCookies = await page.cookies('https://www.sahibinden.com').catch(() => []);
            const cfClearanceCookie = allCurrentCookies.find((c) => c.name === 'cf_clearance');
            const hasValidCfClearance = cfClearanceCookie
                ? (!cfClearanceCookie.expires || cfClearanceCookie.expires === -1 || cfClearanceCookie.expires > nowSecs)
                : false;

            if (cfClearanceCookie) {
                const expiry = cfClearanceCookie.expires;
                const expiresIn = expiry && expiry !== -1 ? Math.round((expiry - nowSecs) / 60) : null;
                log.info(`cf_clearance cookie found. expires in: ${expiresIn !== null ? expiresIn + ' min' : 'session'}, valid: ${hasValidCfClearance}`);
            } else {
                log.info('No cf_clearance cookie present in page context.');
            }

            if (!hasValidCfClearance && !session?.userData?.warmedUp) {
                log.info('No valid cf_clearance found — pre-warming session via homepage...');
                try {
                    // The homepage visit is a plain top-level navigation: no Referer.
                    await page.setExtraHTTPHeaders(buildNavigationHeaders(ua, { sameOrigin: false }));
                    await page.goto('https://www.sahibinden.com', {
                        waitUntil: 'networkidle2',
                        timeout: 60000,
                    });
                    const warmContent = await page.content();
                    if (isChallengedPage(warmContent)) {
                        log.info('CF challenge on homepage during pre-warm, waiting for auto-resolution...');
                        await saveDebugInfo(page, 'prewarm-challenge');
                        try {
                            await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
                            const afterWarm = await page.content();
                            if (isChallengedPage(afterWarm)) {
                                log.warning('Pre-warm CF challenge did not resolve. Will attempt target URL anyway.');
                                await saveDebugInfo(page, 'prewarm-challenge-unresolved');
                            } else {
                                log.info('Pre-warm CF challenge resolved.');
                            }
                        } catch (e) {
                            log.warning(`Pre-warm CF challenge wait timed out: ${e.message}`);
                        }
                    } else {
                        log.info('Pre-warm homepage loaded successfully (no challenge).');
                    }
                    if (session) session.userData = { ...session.userData, warmedUp: true };
                    await randomDelay(1500, 3000);
                } catch (e) {
                    log.warning(`Session pre-warm failed: ${e.message}`);
                }
            } else if (hasValidCfClearance) {
                log.debug('Valid cf_clearance present, skipping pre-warm.');
            }

            // --- 4. Headers for the target navigation -----------------------
            // Detail pages are presented as same-origin navigations coming
            // from the category page that listed them.
            const isDetailPage = request.userData?.label === 'DETAIL';
            const sourceUrl = request.userData?.listingData?.sourceUrl;
            await page.setExtraHTTPHeaders(buildNavigationHeaders(ua, {
                sameOrigin: isDetailPage,
                referer: isDetailPage ? sourceUrl : undefined,
            }));

            // Extra pause before detail pages.
            if (isDetailPage) {
                await randomDelay(4000, 8000);
            }

            if (gotoOptions) {
                gotoOptions.waitUntil = 'networkidle2';
                gotoOptions.timeout = 90000;
            }
        },
    ],

    postNavigationHooks: [
        async ({ page, response, request, session }) => {
            const statusCode = response?.status();
            log.info(`Response status: ${statusCode} for ${request.url}`);

            if (statusCode === 403 || statusCode === 503 || statusCode === 429) {
                log.warning(`Got ${statusCode} for ${request.url} — checking page content...`);
                await saveDebugInfo(page, `${statusCode}-initial`);

                await randomDelay(5000, 10000);

                // Best-effort mouse activity; a failure here must not fail the request.
                try {
                    await page.mouse.move(100 + Math.random() * 500, 100 + Math.random() * 500);
                    await page.mouse.move(200 + Math.random() * 500, 200 + Math.random() * 500);
                } catch (e) {
                    log.debug(`Mouse movement on blocked page failed: ${e.message}`);
                }

                const content = await page.content();

                if (isPxHoldChallenge(content)) {
                    log.info('PerimeterX hold challenge detected — attempting automated hold...');
                    await saveDebugInfo(page, `${statusCode}-px-hold`);
                    const pxSolved = await tryHoldPxButton(page);
                    if (!pxSolved) {
                        log.warning('PerimeterX hold challenge failed. Provide _px3/_pxhd/_pxvid/pxcts cookies from your browser to bypass this.');
                        await saveDebugInfo(page, `${statusCode}-px-hold-failed`);
                        if (session) session.markBad();
                        throw new Error('PerimeterX hold challenge not resolved');
                    }
                } else if (isChallengedPage(content)) {
                    log.info('Cloudflare challenge page detected, waiting for auto-resolution...');
                    await saveDebugInfo(page, `${statusCode}-challenge`);

                    if (await clickDevamEtButton(page)) {
                        log.info('Clicked "Devam Et" button.');
                    }

                    try {
                        await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 45000 });

                        const resolvedContent = await page.content();
                        if (isPxHoldChallenge(resolvedContent)) {
                            log.info('CF resolved but now hit PX hold challenge — attempting hold...');
                            await saveDebugInfo(page, `${statusCode}-cf-then-px`);
                            const pxSolved = await tryHoldPxButton(page);
                            if (!pxSolved) {
                                if (session) session.markBad();
                                throw new Error('PerimeterX hold challenge not resolved after CF');
                            }
                        } else if (isChallengedPage(resolvedContent)) {
                            log.warning('Cloudflare challenge navigated to another challenge page. Marking session bad.');
                            await saveDebugInfo(page, `${statusCode}-challenge-still-blocked`);
                            if (session) session.markBad();
                            throw new Error('Cloudflare Turnstile challenge requires manual verification');
                        } else {
                            log.info('Cloudflare challenge resolved!');
                        }
                    } catch (e) {
                        // The two errors thrown just above are passed through unchanged;
                        // anything else is treated as the challenge timing out.
                        if (e.message.includes('Turnstile') || e.message.includes('PerimeterX')) throw e;
                        log.warning('Cloudflare challenge did not resolve in time. Retrying...');
                        await saveDebugInfo(page, `${statusCode}-challenge-timeout`);
                        if (session) session.markBad();
                        throw new Error('Cloudflare challenge timeout');
                    }
                } else {
                    // This branch is reached for 503 and 429 as well, so report the real status.
                    log.warning(`Received ${statusCode} without recognized challenge page. Marking session as bad.`);
                    await saveDebugInfo(page, `${statusCode}-unknown-block`);
                    if (session) session.markBad();
                    throw new Error(`Blocked with status ${statusCode}`);
                }
            }

            const currentUrl = page.url();
            if (currentUrl.includes('/cs/tloading') || currentUrl.includes('/cs/checkLoading')) {
                log.info('Detected tloading/checkLoading protection page, waiting for JS redirect...');
                try {
                    await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
                    log.info(`tloading resolved, now at: ${page.url()}`);
                } catch (e) {
                    log.warning('tloading page did not redirect in time. Marking session bad and retrying.');
                    if (session) session.markBad();
                    throw new Error('tloading page did not resolve');
                }
            }

            if (page.url().includes('/giris') || page.url().includes('secure.sahibinden.com')) {
                log.error('Redirected to login page. Your session cookies are missing or expired.');
                if (session) session.markBad();
                throw new Error('Mandatory login required. Please update the sessionCookies input.');
            }

            if (statusCode && statusCode >= 200 && statusCode < 300) {
                if (session) session.markGood();
            }
        },
    ],

    requestHandler: async ({ page, request, enqueueLinks, session }) => {
        const label = request.userData?.label || 'CATEGORY';
        log.info(`Processing page [${label}]: ${request.url}`);

        await randomDelay(2000, 5000);

        try {
            let currentUrl = page.url();
            let pageTitle = await page.title().catch(() => '');

            if (isIntermediateLoadingPage(currentUrl, pageTitle)) {
                log.warning('Detected intermediate page in requestHandler. Waiting for automatic redirect...');

                // Poll up to 30 times, 1-1.5 s apart.
                let waited = 0;
                while (isIntermediateLoadingPage(currentUrl, pageTitle) && waited < 30) {
                    await randomDelay(1000, 1500);

                    if (await clickDevamEtButton(page)) {
                        log.info('Clicked "Devam Et" button during tloading wait.');
                    }

                    currentUrl = page.url();
                    pageTitle = await page.title().catch(() => '');
                    waited++;
                }

                if (isIntermediateLoadingPage(currentUrl, pageTitle)) {
                    await saveDebugInfo(page, 'tloading-stuck');
                    if (session) session.markBad();
                    throw new Error('Stuck on intermediate loading page');
                }
                log.info(`Successfully passed intermediate page. Now on: ${currentUrl}`);
            }

            // Narrower marker set than isChallengedPage(): only these three
            // strings are checked here.
            const hasCloudflareMarkers = (html) => (
                html.includes('Just a moment')
                || html.includes('Checking your browser')
                || html.includes('cf-browser-verification')
            );

            const pageContent = await page.content();
            if (isPxHoldChallenge(pageContent)) {
                log.info('PerimeterX challenge detected in requestHandler — attempting hold...');
                await saveDebugInfo(page, 'px-hold-in-handler');
                const pxSolved = await tryHoldPxButton(page);
                if (!pxSolved) {
                    if (session) session.markBad();
                    throw new Error('PerimeterX hold challenge not resolved in requestHandler');
                }
            } else if (hasCloudflareMarkers(pageContent)) {
                log.warning('Cloudflare challenge still present in requestHandler, waiting...');
                await saveDebugInfo(page, 'cf-challenge-in-handler');
                await randomDelay(8000, 15000);
                await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 }).catch(() => { });

                const newContent = await page.content();
                if (hasCloudflareMarkers(newContent) || isPxHoldChallenge(newContent)) {
                    if (session) session.markBad();
                    throw new Error('Challenge not resolved in requestHandler');
                }
            }

            await page.waitForSelector('body', { timeout: 45000 });

            if (label === 'DETAIL') {
                await handleDetailPage(page, request);
            } else {
                await handleCategoryPage(page, request, enqueueLinks);
            }
        } catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            log.error(`Error processing ${request.url}: ${errorMessage}`, {
                stack: error instanceof Error ? error.stack : undefined,
            });
            // Rethrown after logging so the crawler sees the failure.
            throw error;
        }
    },

    failedRequestHandler: async ({ request }) => {
        log.error(`Request failed after retries: ${request.url}`, {
            errors: request.errorMessages,
        });
    },
});
687
688
689
690
// Status codes whose pages are inspected (and possibly solved) by the
// post-navigation hook. The crawler's own blocked-status check must not reject
// them afterwards, otherwise a successfully solved challenge is still failed.
// 429 is included because the hook handles it exactly like 403 and 503.
const HOOK_HANDLED_STATUS_CODES = new Set([403, 429, 503]);

// NOTE(review): `_throwOnBlockedRequest` is an underscore-prefixed (internal)
// crawler method; the optional call keeps this a no-op if it does not exist.
const originalThrowOnBlocked = crawler._throwOnBlockedRequest?.bind(crawler);
if (originalThrowOnBlocked) {
    crawler._throwOnBlockedRequest = function (session, statusCode) {
        if (HOOK_HANDLED_STATUS_CODES.has(statusCode)) {
            log.debug(`Suppressing Crawlee's built-in ${statusCode} block check (handled by postNavigationHook)`);
            return;
        }
        return originalThrowOnBlocked(session, statusCode);
    };
    log.info('Overridden Crawlee blocked request check for Cloudflare compatibility');
}
702
703
704
705
/**
 * Tells whether the URL or title belongs to one of the intermediate
 * "loading" pages shown before a category page.
 *
 * @param {string} url - Current page URL.
 * @param {string} title - Current page title.
 * @returns {boolean}
 */
function isCategoryLoadingInterstitial(url, title) {
    return (
        url.includes('/cs/tloading')
        || url.includes('/cs/checkLoading')
        || title.includes('Yükleniyor')
        || title.includes('Bir dakika')
    );
}

/**
 * Pushes the collected listings to the dataset and, when available, to BaseRow.
 * A BaseRow failure is logged and does not fail the page. No-op for an empty batch.
 *
 * @param {object[]} results - Listings extracted from the current page.
 * @returns {Promise<void>}
 */
async function flushCategoryResults(results) {
    if (results.length === 0) return;

    await Actor.pushData(results);
    log.info(`Pushed ${results.length} listings from page. Total scraped: ${scrapedItemsCount}`);

    if (!baseRowIntegration) return;
    try {
        await baseRowIntegration.storeListings(results);
    } catch (error) {
        log.warning('Failed to store data in BaseRow', { error: error.message });
    }
}

/**
 * Finds the listing rows of a category page.
 *
 * Tries the primary selector first; on failure waits out a loading
 * interstitial (if one is showing), retries, and finally falls back to a list
 * of alternative selectors. Diagnostic information is logged before giving up.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} listingRowSelector - Primary selector for a listing row.
 * @returns {Promise<object[]>} Non-empty array of element handles.
 * @throws {Error} When no selector matches anything.
 */
async function locateCategoryListingRows(page, listingRowSelector) {
    let listingElements = [];
    try {
        await page.waitForSelector(listingRowSelector, { timeout: 15000 });
        listingElements = await page.$$(listingRowSelector);
        if (debugMode) await saveDebugInfo(page, 'category-loaded');
        return listingElements;
    } catch (e) {
        // Fall through to the recovery path below.
    }

    let currentUrl = page.url();
    let pageTitle = await page.title().catch(() => '');

    if (isCategoryLoadingInterstitial(currentUrl, pageTitle)) {
        log.warning('Detected intermediate redirect during selector wait. Waiting for resolution...');

        // Poll up to 30 times, 1-1.5 s apart.
        let waited = 0;
        while (isCategoryLoadingInterstitial(currentUrl, pageTitle) && waited < 30) {
            await randomDelay(1000, 1500);
            currentUrl = page.url();
            pageTitle = await page.title().catch(() => '');
            waited++;
        }

        log.info('Navigation complete. Retrying primary selector... Current URL: ' + page.url());
        await page.waitForSelector(listingRowSelector, { timeout: 15000 }).catch(() => { });
        listingElements = await page.$$(listingRowSelector);
    }

    if (listingElements.length > 0) return listingElements;

    log.warning(`Primary selector failed: ${listingRowSelector}`);
    await saveDebugInfo(page, 'category-selector-failed');

    const pageTitleDebug = await page.title().catch(() => 'unknown');
    const currentUrlDebug = page.url();
    log.info('Page state when selector failed:', { title: pageTitleDebug, url: currentUrlDebug });

    const alternativeSelectors = [
        'table.searchResultsTable tr.searchResultsItem',
        '.searchResultsRowClass .searchResultsItem',
        'tr.searchResultsItem',
        '.classified-list-item',
        '[data-id]',
        '.searchResults .result-item',
        'table tr[data-id]',
    ];

    for (const altSelector of alternativeSelectors) {
        const altElements = await page.$$(altSelector);
        if (altElements.length > 0) {
            log.info(`Found ${altElements.length} elements with alternative selector: ${altSelector}`);
            return altElements;
        }
    }

    // Nothing matched: log the page structure to help updating the selectors.
    const tableCount = await page.$$eval('table', (tables) => tables.length).catch(() => 0);
    const trCount = await page.$$eval('tr', (rows) => rows.length).catch(() => 0);
    const tbodyCount = await page.$$eval('tbody', (bodies) => bodies.length).catch(() => 0);
    log.info('DEBUG: Page structure', { tables: tableCount, rows: trCount, tbodies: tbodyCount });

    const searchClasses = await page.evaluate(() => {
        const keywords = ['search', 'result', 'listing', 'classified'];
        const classes = new Set();
        document.querySelectorAll('*').forEach((el) => {
            if (el.className && typeof el.className === 'string') {
                el.className.split(' ').forEach((cls) => {
                    const lower = cls.toLowerCase();
                    if (keywords.some((keyword) => lower.includes(keyword))) {
                        classes.add(cls);
                    }
                });
            }
        });
        return Array.from(classes);
    }).catch(() => []);
    log.info('DEBUG: Relevant CSS classes found:', { classes: searchClasses });

    throw new Error('No listing elements found with any selector');
}

/**
 * Extracts one listing from a result row.
 *
 * @param {object} element - Element handle of the row.
 * @param {object} request - Current request; its `url` is stored as `sourceUrl`.
 * @param {object} selectors - `{ titleLinkSelector, priceSelector, dateSelector, locationSelector }`.
 * @returns {Promise<object|null>} The listing record, or `null` when the row has no title or detail URL.
 */
async function extractCategoryListing(element, request, selectors) {
    const { titleLinkSelector, priceSelector, dateSelector, locationSelector } = selectors;

    const titleElement = await element.$(titleLinkSelector);
    const title = await titleElement?.evaluate((el) => el.textContent?.trim()).catch(() => null);
    const detailUrl = await titleElement?.evaluate((el) => el.href).catch(() => null);
    if (!title || !detailUrl) return null;

    const priceText = await element.$eval(priceSelector, (el) => el.textContent?.trim()).catch(() => null);

    // Multi-line cells are flattened: location lines joined with " / ", date lines with a space.
    const location = await element.$eval(locationSelector, (el) => {
        return el.innerText?.trim().replace(/\n/g, ' / ');
    }).catch(() => null);

    const date = await element.$eval(dateSelector, (el) => {
        return el.innerText?.trim().replace(/\n/g, ' ');
    }).catch(() => null);

    const image = await element.$eval('img', (el) => el.src || el.dataset?.src || null).catch(() => null);

    // Prefer the row's data-id attribute; otherwise derive the id from the URL.
    const rowId = await element.evaluate((el) => el.getAttribute('data-id')).catch(() => null);
    const id = rowId ?? extractListingId(detailUrl);

    const tagAttributes = await element.$$eval('td.searchResultsTagAttributeValue', (cells) =>
        cells.map((cell) => cell.textContent?.trim() || '')
    ).catch(() => []);

    const attributes = await element.$$eval('td.searchResultsAttributeValue', (cells) =>
        cells.map((cell) => cell.textContent?.trim() || '')
    ).catch(() => []);

    // The number of tag cells decides which of make / series / model are present.
    let make = null;
    let series = null;
    let model = null;
    if (tagAttributes.length === 3) {
        [make, series, model] = tagAttributes;
    } else if (tagAttributes.length === 2) {
        [series, model] = tagAttributes;
    } else if (tagAttributes.length === 1) {
        model = tagAttributes[0];
    }

    // Attribute cells are read positionally as year, km, color.
    const year = attributes[0] || null;
    const km = attributes[1] || null;
    const color = attributes[2] || null;

    return {
        id,
        url: detailUrl,
        title: normalizeText(title),
        make: make ? normalizeText(make) : null,
        series: series ? normalizeText(series) : null,
        model: model ? normalizeText(model) : null,
        year: year ? normalizeText(year) : null,
        km: km ? normalizeText(km) : null,
        color: color ? normalizeText(color) : null,
        price: formatPrice(priceText),
        price_currency: extractCurrency(priceText),
        price_raw: priceText,
        location: normalizeText(location),
        date: normalizeText(date),
        image,
        scrapedAt: new Date().toISOString(),
        sourceUrl: request.url,
    };
}

/**
 * Processes one category (search result) page: extracts every listing row,
 * stores the listings (or enqueues their detail pages when `includeDetails`
 * is set), and enqueues the next result page unless `maxItems` was reached.
 *
 * @param {object} page - Puppeteer page showing the category page.
 * @param {object} request - Current crawler request.
 * @param {Function} enqueueLinks - Crawling-context helper used to enqueue detail / next pages.
 * @returns {Promise<void>}
 * @throws {Error} When no listing rows can be located. This is propagated so
 *   that the caller treats the request as failed, instead of silently
 *   recording a blocked or broken page as handled. Errors that happen after
 *   the rows were located are logged as warnings and not rethrown.
 */
async function handleCategoryPage(page, request, enqueueLinks) {
    log.info(`Handling category page: ${request.url}`);

    const listingRowSelector = 'tbody.searchResultsRowClass > tr.searchResultsItem';
    const selectors = {
        titleLinkSelector: 'td.searchResultsTitleValue a.classifiedTitle',
        priceSelector: 'td.searchResultsPriceValue span',
        dateSelector: 'td.searchResultsDateValue',
        locationSelector: 'td.searchResultsLocationValue',
    };
    const nextPageSelector = 'a.prevNextBut[title="Sonraki"]:not(.passive)';

    const isLimitReached = () => maxItems !== null && scrapedItemsCount >= maxItems;

    // Deliberately outside the try/catch below: see @throws.
    const listingElements = await locateCategoryListingRows(page, listingRowSelector);

    try {
        log.info(`Found ${listingElements.length} listings on page.`);

        const results = [];
        let stoppedEarly = false;

        for (const element of listingElements) {
            if (isLimitReached()) {
                stoppedEarly = true;
                break;
            }

            try {
                const listingData = await extractCategoryListing(element, request, selectors);
                if (!listingData) {
                    log.debug('Skipping row due to missing title or detailUrl.');
                    continue;
                }

                if (includeDetails) {
                    // The listing is stored by the detail handler; pass along what is known so far.
                    await enqueueLinks({
                        urls: [listingData.url],
                        userData: {
                            label: 'DETAIL',
                            listingData,
                        },
                    });
                } else {
                    // Other pages are processed concurrently and may have used up
                    // the quota while this row was being extracted. Re-check right
                    // before committing; nothing is awaited between this check and
                    // the increment, so the limit cannot be overshot.
                    if (isLimitReached()) {
                        stoppedEarly = true;
                        break;
                    }
                    results.push(listingData);
                    scrapedItemsCount++;
                }
            } catch (extractError) {
                const errorMsg = extractError instanceof Error ? extractError.message : String(extractError);
                log.warning(`Could not process one item on ${request.url}`, { error: errorMsg });
            }
        }

        if (stoppedEarly) {
            log.info(`Maximum items limit (${maxItems}) reached. Stopping scrape.`);
            await flushCategoryResults(results);
            await crawler.autoscaledPool?.abort();
            return;
        }

        await flushCategoryResults(results);
        if (results.length === 0 && !includeDetails) {
            log.info(`No listings extracted from page ${request.url}.`);
        }

        if (isLimitReached()) {
            log.info(`Maximum items limit (${maxItems}) reached. Not enqueueing next page.`);
            await crawler.autoscaledPool?.abort();
            return;
        }

        const nextPageUrl = await page.$eval(nextPageSelector, (anchor) => anchor.href).catch(() => null);
        if (nextPageUrl) {
            log.info(`Enqueueing next category page: ${nextPageUrl}`);
            const absoluteNextPageUrl = new URL(nextPageUrl, request.loadedUrl || request.url).toString();
            await enqueueLinks({
                urls: [absoluteNextPageUrl],
                userData: { label: 'CATEGORY' },
            });
            await randomDelay(1000, 3000);
        } else {
            log.info(`No next page button found on ${request.url}`);
        }
    } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        log.warning(`Could not handle category page ${request.url}: ${errorMessage}`);
    }
}
959
960
961
962
/**
 * Handles a single listing detail page.
 *
 * Extracts the description, the attribute list, image URLs, the seller name and
 * the listing id from the page, merges them over the data carried in
 * `request.userData.listingData`, pushes the merged record to the dataset and,
 * when `baseRowIntegration` is set, stores it there as well.
 *
 * Each extraction step is best-effort: a missing element yields an empty string,
 * `null` or an empty collection instead of failing the whole page. If processing
 * fails before a record was stored, the carried `listingData` is pushed instead
 * (only when it has a title). Once a record has been stored, a later failure
 * never stores a second one.
 *
 * Reads the file-level `maxItems`, `debugMode`, `baseRowIntegration` and `crawler`,
 * and increments the file-level `scrapedItemsCount`.
 *
 * @param {object} page - Page object exposing `waitForSelector`, `$eval` and `$$eval`.
 * @param {object} request - Request being handled; `url` and `userData.listingData` are read.
 * @returns {Promise<void>}
 */
async function handleDetailPage(page, request) {
    log.info(`Handling detail page: ${request.url}`);

    // Data collected earlier for this listing; empty when the request carries none.
    const listingData = request.userData?.listingData || {};

    // Thrown values are not guaranteed to be Error instances.
    const toMessage = (err) => (err instanceof Error ? err.message : String(err));

    // Flipped once a record for this listing is in the dataset, so the fallback
    // in the outer catch cannot store (and count) the same listing twice.
    let isStored = false;

    try {
        await page.waitForSelector('body', { timeout: 30000 });
        await randomDelay(1000, 3000);

        if (debugMode) await saveDebugInfo(page, 'detail-loaded');

        const description = await page
            .$eval('#classifiedDescription', (el) => el.textContent?.trim() || '')
            .catch(() => '');

        // Label -> value map built from the attribute list.
        const info = {};
        try {
            // One evaluation for the whole list: no per-item round trips and no
            // element handles left behind.
            const rows = await page.$$eval('.classifiedInfoList li', (items) => items.map((item) => [
                item.querySelector('strong')?.textContent?.trim() ?? null,
                item.querySelector('span')?.textContent?.trim() ?? null,
            ]));
            for (const [label, value] of rows) {
                if (label && value) {
                    info[normalizeText(label)] = normalizeText(value);
                }
            }
        } catch (e) {
            log.debug('Could not extract info list', { error: toMessage(e) });
        }

        const images = await page.$$eval(
            '.classifiedDetailMainPhoto img, .swiper-slide img, #classifiedDetailPhotos img',
            (imgs) => imgs.map((img) => img.src || img.dataset?.src).filter(Boolean),
        ).catch(() => []);

        // The three selectors can match the same picture more than once.
        const uniqueImages = [...new Set(images)];

        const seller = await page.$eval(
            '.classifiedUserContent h5, .classifiedOtherBoxes .username-info-area',
            (el) => el.textContent?.trim(),
        ).catch(() => null);

        // Keep only the digits of the on-page listing id.
        const pageId = await page.$eval(
            '.classifiedId',
            (el) => el.textContent?.replace(/[^0-9]/g, ''),
        ).catch(() => null);

        const completeData = {
            ...listingData,
            id: listingData.id || pageId || extractListingId(request.url),
            description: normalizeText(description),
            images: uniqueImages,
            seller: seller ? normalizeText(seller) : null,
            info,

            // Values from the detail page win over the ones carried in listingData.
            make: info['Marka'] || listingData.make || null,
            series: info['Seri'] || listingData.series || null,
            model: info['Model'] || listingData.model || null,
            year: info['Yıl'] || listingData.year || null,
            fuel: info['Yakıt'] || null,
            gear: info['Vites'] || null,
            km: info['KM'] || listingData.km || null,
            bodyType: info['Kasa Tipi'] || null,
            enginePower: info['Motor Gücü'] || null,
            engineCapacity: info['Motor Hacmi'] || null,
            traction: info['Çekiş'] || null,
            color: info['Renk'] || listingData.color || null,
            warranty: info['Garanti'] || null,
            damageRecord: info['Ağır Hasar Kayıtlı'] || info['Hasar Durumu'] || null,
            plate: info['Plaka / Uyruk'] || null,
            fromWho: info['Kimden'] || null,
        };

        await Actor.pushData(completeData);
        isStored = true;
        scrapedItemsCount++;
        log.info(`Pushed detail data for listing ${completeData.id}. Total scraped: ${scrapedItemsCount}`);

        if (baseRowIntegration) {
            // A BaseRow failure must not fail the page; the dataset already has the record.
            try {
                await baseRowIntegration.storeListing(completeData);
            } catch (error) {
                log.warning('Failed to store detail data in BaseRow', { error: toMessage(error) });
            }
        }

        if (maxItems !== null && scrapedItemsCount >= maxItems) {
            log.info(`Maximum items limit (${maxItems}) reached.`);
            await crawler.autoscaledPool?.abort();
        }
    } catch (error) {
        log.warning(`Could not handle detail page ${request.url}: ${toMessage(error)}`);

        // Fall back to the carried data only when nothing was stored for this listing yet.
        if (!isStored && listingData.title) {
            await Actor.pushData(listingData);
            scrapedItemsCount++;
        }
    }
}
1068
1069
1070
1071
// Turn the `startUrls` input (plain strings or `{ url }` objects, single or array)
// into crawler requests. Entries that cannot be used are logged and dropped.
const startRequests = (Array.isArray(startUrls) ? startUrls : [startUrls]).map((item) => {
    let urlString;
    if (typeof item === 'string') {
        urlString = item;
    } else if (item && typeof item.url === 'string') {
        urlString = item.url;
    } else {
        log.warning('Skipping invalid start URL item:', { item });
        return null;
    }

    // Besides the prefix check, require the value to parse as an absolute http(s)
    // URL so that values like "httpfoo" are skipped here instead of reaching the
    // crawler's request queue.
    let protocol = null;
    try {
        ({ protocol } = new URL(urlString));
    } catch {
        // Not parseable; `protocol` stays null and the entry is rejected below.
    }
    const isHttpUrl = urlString.startsWith('http') && (protocol === 'http:' || protocol === 'https:');
    if (!isHttpUrl) {
        log.warning('Skipping item with invalid URL string:', { urlString });
        return null;
    }

    // URLs containing both "/ilan/" and "/detay" are routed to the detail handler;
    // everything else is treated as a category (listing) page.
    const isDetailUrl = urlString.includes('/ilan/') && urlString.includes('/detay');
    return { url: urlString, userData: { label: isDetailUrl ? 'DETAIL' : 'CATEGORY' } };
}).filter((req) => req !== null);

if (startRequests.length > 0) {
    await crawler.addRequests(startRequests);
    log.info(`Added ${startRequests.length} initial requests to the queue.`);
} else {
    log.warning('No valid start URLs found in the input. Exiting.');
    // Actor.exit takes (messageOrOptions, options): the status message goes first
    // and the exit code goes into the options object.
    await Actor.exit('No valid start URLs provided.', { exitCode: 1 });
}

log.info('Starting the crawler...');
await crawler.run();
log.info(`Crawler finished. Total items scraped: ${scrapedItemsCount}`);

await Actor.exit();