sahibinden-scraper-puppeteer-js
Under maintenance
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 23
Monthly users: 19
Runs succeeded: 96%
Last modified: a month ago
.actor/Dockerfile
# Specify the base Docker image.
FROM apify/actor-node-puppeteer-chrome:20 AS intermediate

# Check preinstalled packages.
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
# NOTE: There are no dev dependencies anymore; this install still covers the production ones.
RUN npm install --include=dev --audit=false

# Copy the source files using the user set in the base image.
COPY . ./

# No 'npm run build' step is needed here anymore.

# Create the final image.
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages.
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json.
COPY package*.json ./

# Install ONLY production dependencies.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# No COPY from the builder stage for /dist is needed here anymore.

# Copy the remaining files and directories with the source code.
COPY . ./

# Run the image.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Project Puppeteer Crawler Typescript", "description": "Crawlee and Puppeteer project in typescript.", "version": "0.0", "meta": { "templateId": "ts-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "PuppeteerCrawler Template", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "URLs to start with.", "editor": "requestListSources", "prefill": [ { "url": "https://apify.com" } ] } }}
src/baserow.js
import axios from 'axios';
import { Actor, log } from 'apify';

/**
 * BaseRow integration for storing scraped car listings
 */
export class BaseRowIntegration {
    /**
     * Initialize BaseRow integration
     * @param {Object} options Configuration options
     * @param {string} options.apiToken BaseRow API token
     * @param {string} options.tableId BaseRow table ID
     * @param {string} options.databaseId BaseRow database ID
     */
    constructor(options) {
        const { apiToken, tableId, databaseId } = options;

        if (!apiToken) throw new Error('BaseRow API token is required');
        if (!tableId) throw new Error('BaseRow table ID is required');
        if (!databaseId) throw new Error('BaseRow database ID is required');

        this.apiToken = apiToken;
        this.tableId = tableId;
        this.databaseId = databaseId;
        this.baseUrl = 'https://api.baserow.io/api';
        this.client = axios.create({
            baseURL: this.baseUrl,
            headers: {
                'Authorization': `Token ${this.apiToken}`,
                'Content-Type': 'application/json'
            }
        });

        log.info('BaseRow integration initialized', { tableId, databaseId });
    }

    /**
     * Store a car listing in BaseRow
     * @param {Object} carData Car listing data
     * @returns {Promise<Object>} Created row data
     */
    async storeCarListing(carData) {
        try {
            // Prepare data for BaseRow
            const rowData = this._prepareRowData(carData);

            // Check if the listing already exists to avoid duplicates
            const existingRow = await this._findExistingListing(carData.id);

            if (existingRow) {
                log.info(`Updating existing listing: ${carData.id}`);
                return await this._updateRow(existingRow.id, rowData);
            } else {
                log.info(`Creating new listing: ${carData.id}`);
                return await this._createRow(rowData);
            }
        } catch (error) {
            log.error(`Error storing car listing in BaseRow: ${error.message}`);
            throw error;
        }
    }

    /**
     * Store multiple car listings in BaseRow
     * @param {Array<Object>} carListings Array of car listing data
     * @returns {Promise<Array<Object>>} Created/updated row data
     */
    async storeCarListings(carListings) {
        log.info(`Storing ${carListings.length} car listings in BaseRow`);

        const results = [];
        for (const carData of carListings) {
            try {
                const result = await this.storeCarListing(carData);
                results.push(result);
            } catch (error) {
                log.error(`Error storing car listing ${carData.id}: ${error.message}`);
                // Continue with the next listing
            }
        }

        log.info(`Successfully stored ${results.length} out of ${carListings.length} car listings`);
        return results;
    }

    /**
     * Find an existing listing by ID
     * @param {string} listingId Sahibinden.com listing ID
     * @returns {Promise<Object|null>} Existing row or null if not found
     * @private
     */
    async _findExistingListing(listingId) {
        try {
            const response = await this.client.get(
                `/database/rows/table/${this.tableId}/`,
                {
                    params: {
                        search: listingId,
                        user_field_names: true
                    }
                }
            );

            const rows = response.data.results;
            return rows.find(row => row.id === listingId || row.listing_id === listingId) || null;
        } catch (error) {
            log.error(`Error finding existing listing: ${error.message}`);
            return null;
        }
    }

    /**
     * Create a new row in BaseRow
     * @param {Object} rowData Row data
     * @returns {Promise<Object>} Created row data
     * @private
     */
    async _createRow(rowData) {
        const response = await this.client.post(
            `/database/rows/table/${this.tableId}/`,
            rowData,
            {
                params: {
                    user_field_names: true
                }
            }
        );

        return response.data;
    }

    /**
     * Update an existing row in BaseRow
     * @param {number} rowId BaseRow row ID
     * @param {Object} rowData Row data
     * @returns {Promise<Object>} Updated row data
     * @private
     */
    async _updateRow(rowId, rowData) {
        const response = await this.client.patch(
            `/database/rows/table/${this.tableId}/${rowId}/`,
            rowData,
            {
                params: {
                    user_field_names: true
                }
            }
        );

        return response.data;
    }

    /**
     * Prepare car data for BaseRow
     * @param {Object} carData Car listing data
     * @returns {Object} Prepared row data
     * @private
     */
    _prepareRowData(carData) {
        // Map car data to BaseRow fields.
        // Field names should match the BaseRow table structure.
        return {
            listing_id: carData.id,
            url: carData.url,
            title: carData.title,
            price: carData.price,
            price_currency: carData.price_currency,
            location: carData.location,
            description: carData.description,

            // Main info fields
            make: carData.info?.Marka || '',
            model: carData.info?.Model || '',
            series: carData.info?.Seri || '',
            year: carData.info?.Yıl || '',
            fuel_type: carData.info?.Yakıt || '',
            transmission: carData.info?.Vites || '',
            mileage: carData.info?.KM || '',
            body_type: carData.info?.['Kasa Tipi'] || '',
            engine_power: carData.info?.['Motor Gücü'] || '',
            engine_capacity: carData.info?.['Motor Hacmi'] || '',
            drive_type: carData.info?.Çekiş || '',
            doors: carData.info?.Kapı || '',
            color: carData.info?.Renk || '',
            warranty: carData.info?.Garanti || '',
            damage_record: carData.info?.['Ağır Hasar Kayıtlı'] || '',
            plate_nationality: carData.info?.['Plaka / Uyruk'] || '',
            seller_type: carData.info?.Kimden || '',
            trade_in: carData.info?.Takas || '',
            condition: carData.info?.Durumu || '',

            // Images as a JSON string
            images: JSON.stringify(carData.images || []),

            // Attributes and technical specs as JSON strings
            attributes: JSON.stringify(carData.attributes || {}),
            technical_specs: JSON.stringify(carData.technicalSpecs || {}),

            // Metadata
            scraped_at: carData.scrapedAt,
            last_updated: new Date().toISOString()
        };
    }
}

/**
 * Create BaseRow integration from Actor input
 * @returns {Promise<BaseRowIntegration>} BaseRow integration instance
 */
export async function createBaseRowIntegration() {
    const input = await Actor.getInput() || {};
    const { baseRowApiToken, baseRowTableId, baseRowDatabaseId } = input;

    if (!baseRowApiToken || !baseRowTableId || !baseRowDatabaseId) {
        log.warning('BaseRow integration not configured. Data will only be stored in the Apify dataset.');
        return null;
    }

    return new BaseRowIntegration({
        apiToken: baseRowApiToken,
        tableId: baseRowTableId,
        databaseId: baseRowDatabaseId
    });
}
src/main.js
// src/main.js - Puppeteer JavaScript version for bulk category scraping
import { Actor } from 'apify';
import { PuppeteerCrawler, log } from 'crawlee';
// Import puppeteer-extra and the stealth plugin
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';

// Apply the stealth plugin to puppeteer
puppeteer.use(StealthPlugin());

// Initialize the Apify Actor
await Actor.init();

// Get input - simplified for bulk scraping
const input = await Actor.getInput() || {};
const {
    startUrls = ['https://www.sahibinden.com/otomobil/elektrik'], // Specific category
    maxItems = null, // Optional: limit total listings scraped (null for all)
    maxConcurrency = 5, // Concurrency can be slightly higher for category pages
    maxRequestsPerCrawl = 500, // Increase to handle many pages
    proxyConfiguration = {
        useApifyProxy: true,
        apifyProxyGroups: ['RESIDENTIAL'],
    }
} = input;

// Create the final proxy configuration object
const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);

log.info('Starting BULK scraper', { startUrls, maxItems, maxConcurrency, maxRequestsPerCrawl });
if (proxyConfig) {
    log.info('Using proxy configuration', { type: proxyConfig.usesApifyProxy ? 'Apify Proxy' : 'Custom Proxies', groups: proxyConfig.apifyProxyGroups });
} else {
    log.info('No proxy configuration specified.');
}

let scrapedItemsCount = 0; // Counter for maxItems

// --- Helper function for a random User-Agent (keep as before) ---
function randomUserAgent() {
    const userAgents = [ /* ... Fill with many user agents ... */ ];
    return userAgents[Math.floor(Math.random() * userAgents.length)];
}
// -------------------------------------------

// Create the Puppeteer crawler
const crawler = new PuppeteerCrawler({
    proxyConfiguration: proxyConfig,
    maxConcurrency,
    maxRequestsPerCrawl,
    launchContext: {
        launcher: puppeteer,
        launchOptions: { headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] },
        useChrome: false
    },
    preNavigationHooks: [ /* ... Keep your existing preNavigationHooks ... */ ],

    requestHandler: async ({ page, request, enqueueLinks, log }) => {
        log.info(`Processing page: ${request.url}`);
        try {
            // ... Keep Cloudflare check logic ...

            await page.waitForSelector('body', { timeout: 45000 });

            // --- Always handle as a category page now ---
            await handleCategoryPage(page, request, enqueueLinks, log);

        } catch (error) {
            let errorMessage = 'Unknown error during request handling';
            if (error instanceof Error) { errorMessage = error.message; } else { errorMessage = String(error); }
            log.error(`Error processing ${request.url}: ${errorMessage}`, { stack: error instanceof Error ? error.stack : undefined });
            throw error;
        }
    },
    failedRequestHandler: async ({ request, log }) => {
        // ... Keep existing failedRequestHandler ...
    },
});

// --- MODIFIED handler function for category pages ---

async function handleCategoryPage(page, request, enqueueLinks, log) {
    log.info(`Handling category page: ${request.url}`);

    // --- VERIFY THESE SELECTORS on https://www.sahibinden.com/otomobil/elektrik ---
    const listingRowSelector = 'tbody.searchResultsRowClass > tr.searchResultsItem'; // Row for each listing
    // Selectors relative to the listingRowSelector element:
    const makeSelector = 'td:nth-child(2)'; // Assuming Make is the 2nd column
    const seriesSelector = 'td:nth-child(3)'; // Assuming Series is the 3rd column
    const modelSelector = 'td:nth-child(4)'; // Assuming Model is the 4th column
    const titleLinkSelector = 'td.searchResultsTitleValue a.classifiedTitle'; // Title link
    const yearSelector = 'td:nth-child(7)'; // Assuming Year is the 7th column (VERIFY INDEX!)
    const kmSelector = 'td:nth-child(8)'; // Assuming KM is the 8th column (VERIFY INDEX!)
    const priceSelector = 'td.searchResultsPriceValue span'; // Price text
    const dateSelector = 'td.searchResultsDateValue'; // Date column
    const locationSelector = 'td.searchResultsLocationValue'; // Location column
    // Selector for the next page link:
    const nextPageSelector = 'a.prevNextBut[title="Sonraki"]:not(.passive)';
    // -------------------------------------------------------------------------

    try {
        await page.waitForSelector(listingRowSelector, { timeout: 45000 });

        const listingElements = await page.$$(listingRowSelector);
        log.info(`Found ${listingElements.length} listings on page.`);

        const results = []; // Collect results from this page

        for (const element of listingElements) {

            // Check if the maxItems limit is reached
            if (maxItems !== null && scrapedItemsCount >= maxItems) {
                log.info(`Maximum items limit (${maxItems}) reached. Stopping scrape.`);
                // Abort the crawl cleanly
                await crawler.autoscaledPool.abort();
                return; // Stop processing this page
            }

            try {
                // Extract data relative to the current 'element'
                const make = await element.$eval(makeSelector, el => el.textContent?.trim()).catch(() => null);
                const series = await element.$eval(seriesSelector, el => el.textContent?.trim()).catch(() => null);
                const model = await element.$eval(modelSelector, el => el.textContent?.trim()).catch(() => null);
                const titleElement = await element.$(titleLinkSelector); // Get handle for title/link
                const title = await titleElement?.evaluate(el => el.textContent?.trim()).catch(() => null);
                const detailUrl = await titleElement?.evaluate(el => el.href).catch(() => null);
                const year = await element.$eval(yearSelector, el => el.textContent?.trim()).catch(() => null);
                const km = await element.$eval(kmSelector, el => el.textContent?.trim()).catch(() => null);
                const price = await element.$eval(priceSelector, el => el.textContent?.trim()).catch(() => null);
                const date = await element.$eval(dateSelector, el => el.innerText?.trim().replace('\n', ' ')).catch(() => null); // Use innerText to handle <br>
                const location = await element.$eval(locationSelector, el => el.innerText?.trim().replace('<br>', ' / ')).catch(() => null); // Use innerText and replace <br>

                // Basic check that we got the essential data
                if (title && detailUrl) {
                    const listingData = {
                        // Match field names exactly as requested
                        'Marka': make,
                        'Seri': series,
                        'Model': model,
                        'İlan Başlığı': title,
                        'Yıl': year,
                        'KM': km, // Keep as string for now; cleaning later is easier
                        'Fiyat': price, // Keep as string
                        'İlan Tarihi': date,
                        'İl / İlçe': location,
                        'Detay Linki': detailUrl // Good to store the link too
                    };
                    results.push(listingData);
                    scrapedItemsCount++;
                } else {
                    log.debug('Skipping row due to missing title or detailUrl.');
                }

            } catch (extractError) {
                let errorMsg = 'Unknown error extracting item data';
                if (extractError instanceof Error) { errorMsg = extractError.message; } else { errorMsg = String(extractError); }
                log.warning(`Could not process one item fully on ${request.url}`, { error: errorMsg });
            }
        } // End of loop

        // Push all results from this page
        if (results.length > 0) {
            await Actor.pushData(results);
            log.info(`Pushed ${results.length} listings from page ${request.url}. Total scraped: ${scrapedItemsCount}`);
        } else {
            log.info(`No listings extracted from page ${request.url}.`);
        }

        // --- Enqueue the next page ---
        // Check if the maxItems limit is reached before enqueueing the next page
        if (maxItems !== null && scrapedItemsCount >= maxItems) {
            log.info(`Maximum items limit (${maxItems}) reached. Not enqueueing next page.`);
            await crawler.autoscaledPool.abort(); // Ensure the crawl stops
            return;
        }

        const nextPageUrl = await page.$eval(nextPageSelector, anchor => anchor.href).catch(() => null);
        if (nextPageUrl) {
            log.info(`Enqueueing next category page: ${nextPageUrl}`);
            const absoluteNextPageUrl = new URL(nextPageUrl, request.loadedUrl || request.url).toString();
            // Add the request with the same label 'CATEGORY'
            // NOTE: No label is strictly needed with a single handler, but it is good practice
            await enqueueLinks({
                urls: [absoluteNextPageUrl],
                userData: { label: 'CATEGORY' }, // Use the label for consistency
            });
        } else {
            log.info(`No next page button found or clickable on ${request.url}`);
        }

    } catch (error) {
        let errorMessage = 'Unknown error handling category page';
        if (error instanceof Error) { errorMessage = error.message; } else { errorMessage = String(error); }
        log.warning(`Could not handle category page ${request.url}: ${errorMessage}`);
        // Consider whether this should throw to retry the page
        // throw error;
    }
}

// --- handleDetailPage is no longer needed for this approach ---
// async function handleDetailPage(page, request, log) { ... }

// --- Start the crawler ---
const startRequests = startUrls.map(item => {
    // ... (Keep the existing logic to handle string or object input for startUrls) ...
    let urlString;
    if (typeof item === 'string') { urlString = item; }
    else if (item && typeof item.url === 'string') { urlString = item.url; }
    else { log.warning('Skipping invalid start URL item:', { item }); return null; }

    if (!urlString || !urlString.startsWith('http')) {
        log.warning('Skipping item with invalid URL string:', { urlString }); return null;
    }
    return { url: urlString, userData: { label: 'CATEGORY' } }; // Always label as CATEGORY now
}).filter(req => req !== null);

if (startRequests.length > 0) {
    await crawler.addRequests(startRequests);
    log.info(`Added ${startRequests.length} initial requests to the queue.`);
} else {
    log.warning('No valid start URLs found in the input. Exiting.');
    await Actor.exit('No valid start URLs provided.', { exitCode: 1 });
}

log.info('Starting the crawler...');
await crawler.run();
log.info(`Crawler finished. Total items scraped: ${scrapedItemsCount}`);

await Actor.exit();
// --- End of script ---
src/routes.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});
src/scheduler.js
import { Actor } from 'apify';
import { log } from 'crawlee';
import { createBaseRowIntegration } from './baserow.js';

/**
 * Scheduler for automating the Sahibinden.com scraper
 * This module handles the integration with the Apify Scheduler
 */
export class ScraperScheduler {
    /**
     * Initialize the scheduler
     * @param {Object} options Configuration options
     * @param {string} options.scheduleInterval How often to run the scraper (e.g., 'daily', 'weekly')
     * @param {string} options.startTime Time to start the scraper (e.g., '02:00')
     * @param {Array<string>} options.startUrls URLs to start scraping from
     */
    constructor(options = {}) {
        const {
            scheduleInterval = 'daily',
            startTime = '02:00',
            startUrls = ['https://www.sahibinden.com/kategori/vasita']
        } = options;

        this.scheduleInterval = scheduleInterval;
        this.startTime = startTime;
        this.startUrls = startUrls;

        log.info('Scheduler initialized', { scheduleInterval, startTime, startUrls });
    }

    /**
     * Create an Apify scheduler configuration
     * @returns {Object} Scheduler configuration
     */
    createSchedulerConfig() {
        // Convert the schedule interval to a cron expression
        let cronExpression;
        switch (this.scheduleInterval.toLowerCase()) {
            case 'hourly':
                cronExpression = '0 * * * *';
                break;
            case 'daily': {
                const [hours, minutes] = this.startTime.split(':').map(Number);
                cronExpression = `${minutes || 0} ${hours || 0} * * *`;
                break;
            }
            case 'weekly': {
                const [weeklyHours, weeklyMinutes] = this.startTime.split(':').map(Number);
                cronExpression = `${weeklyMinutes || 0} ${weeklyHours || 0} * * 1`; // Monday
                break;
            }
            default:
                cronExpression = '0 2 * * *'; // Default: 2 AM daily
        }

        return {
            cronExpression,
            isEnabled: true,
            isExclusive: true,
            description: 'Scheduled run of Sahibinden.com scraper',
            actorTaskId: Actor.getEnv().actorTaskId || undefined,
            actorId: Actor.getEnv().actorId || undefined,
            input: {
                startUrls: this.startUrls,
                maxConcurrency: 1,
                maxRequestsPerCrawl: 1000,
                proxyConfiguration: {
                    useApifyProxy: true,
                    apifyProxyGroups: ['RESIDENTIAL'],
                    countryCode: 'TR'
                }
            }
        };
    }

    /**
     * Set up the scheduler on the Apify platform
     * @returns {Promise<Object>} Created scheduler
     */
    async setupScheduler() {
        try {
            const client = Actor.newClient();
            const config = this.createSchedulerConfig();

            log.info('Setting up scheduler with configuration', { cronExpression: config.cronExpression });

            // Create a new schedule
            const scheduler = await client.schedules().create({
                name: 'Sahibinden.com Scraper Schedule',
                cronExpression: config.cronExpression,
                isEnabled: config.isEnabled,
                isExclusive: config.isExclusive,
                description: config.description,
                actions: [{
                    actorId: config.actorId,
                    actorTaskId: config.actorTaskId,
                    input: config.input
                }]
            });

            log.info('Scheduler created successfully', { schedulerId: scheduler.id });
            return scheduler;
        } catch (error) {
            log.error('Failed to set up scheduler', { error: error.message });
            throw error;
        }
    }

    /**
     * Monitor the performance of scheduled runs
     * @param {number} limit Number of runs to retrieve
     * @returns {Promise<Array>} Recent runs with statistics
     */
    async monitorPerformance(limit = 10) {
        try {
            const client = Actor.newClient();
            const actorId = Actor.getEnv().actorId;

            if (!actorId) {
                log.warning('Cannot monitor performance: Actor ID not available');
                return [];
            }

            // Get recent runs
            const runs = await client.actor(actorId).runs().list({
                limit,
                desc: true
            });

            // Extract performance metrics
            const performanceStats = runs.items.map(run => ({
                id: run.id,
                status: run.status,
                startedAt: run.startedAt,
                finishedAt: run.finishedAt,
                durationSecs: run.finishedAt
                    ? (new Date(run.finishedAt) - new Date(run.startedAt)) / 1000
                    : null,
                resourceUsage: {
                    cpu: run.stats?.cpuUsage,
                    memory: run.stats?.memoryUsage,
                },
                datasetStats: {
                    itemCount: run.stats?.datasetsOutStats?.['default']?.itemCount || 0,
                    totalBytes: run.stats?.datasetsOutStats?.['default']?.totalBytes || 0,
                },
                errorMessage: run.errorMessage
            }));

            log.info('Retrieved performance statistics', {
                runCount: performanceStats.length,
                latestRun: performanceStats[0]
            });

            return performanceStats;
        } catch (error) {
            log.error('Failed to monitor performance', { error: error.message });
            return [];
        }
    }
}

/**
 * Create scheduler from Actor input
 * @returns {Promise<ScraperScheduler>} Scheduler instance
 */
export async function createScheduler() {
    const input = await Actor.getInput() || {};
    const {
        scheduleInterval,
        startTime,
        startUrls
    } = input;

    return new ScraperScheduler({
        scheduleInterval,
        startTime,
        startUrls
    });
}
src/schema.json
{ "title": "Sahibinden.com Car Listing Schema", "type": "object", "properties": { "id": { "type": "string", "description": "Unique identifier for the listing" }, "url": { "type": "string", "description": "Full URL of the listing" }, "title": { "type": "string", "description": "Title of the car listing" }, "price": { "type": ["number", "null"], "description": "Numeric price value (cleaned)" }, "price_currency": { "type": "string", "description": "Currency of the price (e.g., TL, EUR)" }, "location": { "type": "string", "description": "Location information (City/District/Neighborhood)" }, "description": { "type": "string", "description": "Full description text" }, "info": { "type": "object", "description": "Main information about the vehicle", "properties": { "İlan No": { "type": "string" }, "İlan Tarihi": { "type": "string" }, "Marka": { "type": "string" }, "Seri": { "type": "string" }, "Model": { "type": "string" }, "Yıl": { "type": "string" }, "Yakıt": { "type": "string" }, "Vites": { "type": "string" }, "KM": { "type": "string" }, "Kasa Tipi": { "type": "string" }, "Motor Gücü": { "type": "string" }, "Motor Hacmi": { "type": "string" }, "Çekiş": { "type": "string" }, "Kapı": { "type": "string" }, "Renk": { "type": "string" }, "Garanti": { "type": "string" }, "Ağır Hasar Kayıtlı": { "type": "string" }, "Plaka / Uyruk": { "type": "string" }, "Kimden": { "type": "string" }, "Takas": { "type": "string" }, "Durumu": { "type": "string" } }, "additionalProperties": true }, "images": { "type": "array", "description": "Array of image URLs", "items": { "type": "string" } }, "attributes": { "type": "object", "description": "Grouped attributes by section (e.g., Güvenlik, İç Donanım, Dış Donanım, Multimedya)", "additionalProperties": { "type": "array", "items": { "type": "string" } } }, "technicalSpecs": { "type": "object", "description": "Technical specifications grouped by section", "additionalProperties": { "type": "object", "additionalProperties": true } }, "scrapedAt": { "type": "string", "format": "date-time", "description": "Timestamp when the data was scraped" } }, "required": ["id", "url", "title", "scrapedAt"]}
src/utils.js
import { randomBytes } from 'crypto';

/**
 * Generates a random user agent string for desktop browsers
 * @returns {string} A random user agent string
 */
export function randomUserAgent() {
    const browsers = [
        // Chrome on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',

        // Chrome on macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',

        // Firefox on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:119.0) Gecko/20100101 Firefox/119.0',

        // Firefox on macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:119.0) Gecko/20100101 Firefox/119.0',

        // Safari on macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15',

        // Edge on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
    ];

    return browsers[Math.floor(Math.random() * browsers.length)];
}

/**
 * Generates a random session ID
 * @returns {string} A random session ID
 */
export function generateSessionId() {
    return `session_${randomBytes(8).toString('hex')}`;
}

/**
 * Adds delay between actions to simulate human behavior
 * @param {number} min Minimum delay in milliseconds
 * @param {number} max Maximum delay in milliseconds
 * @returns {Promise<void>} A promise that resolves after the delay
 */
export function randomDelay(min = 1000, max = 5000) {
    const delay = Math.floor(Math.random() * (max - min + 1)) + min;
    return new Promise(resolve => setTimeout(resolve, delay));
}

/**
 * Formats a price string into a numeric value
 * @param {string} priceStr Price string (e.g., "150.000 TL")
 * @returns {number|null} Numeric price value or null if invalid
 */
export function formatPrice(priceStr) {
    if (!priceStr) return null;

    // Remove all non-numeric characters except separators
    const numericStr = priceStr.replace(/[^0-9,.]/g, '')
        .replace(/\./g, '') // Remove thousands separator (.)
        .replace(/,/g, '.'); // Replace comma with decimal point

    const price = parseFloat(numericStr);
    return isNaN(price) ? null : price;
}

/**
 * Extracts a boolean value from a Turkish Yes/No string
 * @param {string} value String value (e.g., "Evet", "Hayır")
 * @returns {boolean|null} Boolean value or null if invalid
 */
export function parseYesNo(value) {
    if (!value) return null;

    const normalized = value.toLowerCase().trim();
    if (normalized === 'evet' || normalized === 'var') return true;
    if (normalized === 'hayır' || normalized === 'yok') return false;

    return null;
}

/**
 * Normalizes text by removing extra whitespace
 * @param {string} text Input text
 * @returns {string} Normalized text
 */
export function normalizeText(text) {
    if (!text) return '';
    return text.replace(/\s+/g, ' ').trim();
}
.dockerignore
# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "root": true, "env": { "browser": true, "es2020": true, "node": true }, "extends": [ "@apify/eslint-config-ts" ], "parserOptions": { "project": "./tsconfig.json", "ecmaVersion": 2020 }, "ignorePatterns": [ "node_modules", "dist", "**/*.d.ts" ]}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
.zed
dist
node_modules
apify_storage
storage
BASEROW_SETUP.md
# BaseRow Setup Guide for Sahibinden.com Scraper

This guide provides detailed instructions for setting up a BaseRow table to store the car listing data scraped from Sahibinden.com.

## 1. Create a BaseRow Account

If you don't already have a BaseRow account:

1. Go to [BaseRow.io](https://baserow.io/)
2. Click "Sign Up" and follow the registration process
3. Verify your email address

## 2. Create a New Database

1. From your BaseRow dashboard, click "Create database"
2. Name it "Sahibinden Car Listings" (or your preferred name)
3. Click "Create"

## 3. Create a Table with the Required Schema

1. In your new database, click "Create table"
2. Name it "Car Listings"
3. Click "Create"

## 4. Set Up Table Fields

Configure the following fields in your table:

| Field Name | Type | Description |
|------------|------|-------------|
| listing_id | Text | Unique identifier from Sahibinden.com |
| url | URL | Full URL of the listing |
| title | Text | Title of the car listing |
| price | Number | Numeric price value |
| price_currency | Text | Currency of the price (TL, EUR) |
| location | Text | Location information |
| description | Long text | Full description text |
| make | Text | Car make/brand |
| model | Text | Car model |
| series | Text | Car series |
| year | Text | Manufacturing year |
| fuel_type | Text | Type of fuel |
| transmission | Text | Transmission type |
| mileage | Text | Kilometer reading |
| body_type | Text | Body type |
| engine_power | Text | Engine power |
| engine_capacity | Text | Engine capacity |
| drive_type | Text | Drive type |
| doors | Text | Number of doors |
| color | Text | Car color |
| warranty | Text | Warranty information |
| damage_record | Text | Damage record status |
| plate_nationality | Text | Plate/nationality information |
| seller_type | Text | Seller type (dealer, individual) |
| trade_in | Text | Trade-in availability |
| condition | Text | Car condition |
| images | Long text | JSON string of image URLs |
| attributes | Long text | JSON string of attributes |
| technical_specs | Long text | JSON string of technical specs |
| scraped_at | Date & Time | When the data was scraped |
| last_updated | Date & Time | When the record was last updated |

To add each field:

1. Click the "+" button at the right end of the field headers
2. Select the appropriate field type
3. Enter the field name
4. Click "Create"

A scripted alternative is sketched below.
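If you would rather not create roughly 30 fields by hand, the BaseRow REST API can create them for you. The sketch below is illustrative only: the `POST /api/database/fields/table/{table_id}/` endpoint and the type slugs (`text`, `long_text`, `number`, `url`, `date`) are assumptions to verify against the current BaseRow API documentation, and your token may need broader permissions than the row-level ones listed in section 5.

```python
import requests

API_TOKEN = "YOUR_BASEROW_API_TOKEN"   # placeholder
TABLE_ID = "YOUR_TABLE_ID"             # placeholder

# (field name, assumed BaseRow type slug) - verify the slugs before running
FIELDS = [
    ("listing_id", "text"), ("url", "url"), ("title", "text"),
    ("price", "number"), ("price_currency", "text"), ("location", "text"),
    ("description", "long_text"), ("make", "text"), ("model", "text"),
    ("series", "text"), ("year", "text"), ("fuel_type", "text"),
    ("transmission", "text"), ("mileage", "text"), ("body_type", "text"),
    ("engine_power", "text"), ("engine_capacity", "text"), ("drive_type", "text"),
    ("doors", "text"), ("color", "text"), ("warranty", "text"),
    ("damage_record", "text"), ("plate_nationality", "text"), ("seller_type", "text"),
    ("trade_in", "text"), ("condition", "text"), ("images", "long_text"),
    ("attributes", "long_text"), ("technical_specs", "long_text"),
    ("scraped_at", "date"), ("last_updated", "date"),
]

for name, field_type in FIELDS:
    response = requests.post(
        f"https://api.baserow.io/api/database/fields/table/{TABLE_ID}/",
        headers={"Authorization": f"Token {API_TOKEN}"},
        json={"name": name, "type": field_type},
    )
    # A 2xx status means the field was created; anything else usually means the
    # field already exists, the type slug is wrong, or the token lacks permission.
    print(name, response.status_code)
```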
## 5. Create an API Token

1. Click on your profile icon in the top-right corner
2. Select "Account settings"
3. Go to the "API tokens" tab
4. Click "Create token"
5. Name it "Sahibinden Scraper"
6. Set appropriate permissions (at minimum: "Read database", "Create table rows", "Update table rows")
7. Click "Create"
8. Copy the generated token and store it securely - you'll need it for the scraper configuration

## 6. Get Table and Database IDs

### Get the Database ID

1. Go to your database
2. Look at the URL in your browser
3. The database ID is the number after `/database/` in the URL
   Example: `https://baserow.io/database/12345/table/67890` → Database ID is `12345`

### Get the Table ID

1. Go to your table
2. Look at the URL in your browser
3. The table ID is the number after `/table/` in the URL
   Example: `https://baserow.io/database/12345/table/67890` → Table ID is `67890`

## 7. Configure the Scraper

Use the following information in your Apify Actor configuration:
```json
{
    "baseRowApiToken": "YOUR_BASEROW_API_TOKEN",
    "baseRowTableId": "497942",
    "baseRowDatabaseId": "206595"
}
```
## 8. Test the Integration

1. Run the scraper with a small `maxRequestsPerCrawl` value (e.g., 10)
2. Check your BaseRow table to verify that data is being stored correctly
3. Verify that all fields are populated as expected

A quick way to spot-check the table from a terminal is shown below.
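This is a minimal sketch that reuses the same list-rows endpoint the scraper's BaseRow integration calls; the token and table ID are placeholders, and the `size` parameter is assumed to limit the page size.

```python
import requests

API_TOKEN = "YOUR_BASEROW_API_TOKEN"   # placeholder
TABLE_ID = "YOUR_TABLE_ID"             # placeholder

# Fetch a handful of rows with human-readable field names.
response = requests.get(
    f"https://api.baserow.io/api/database/rows/table/{TABLE_ID}/",
    headers={"Authorization": f"Token {API_TOKEN}"},
    params={"user_field_names": "true", "size": 5},
)
response.raise_for_status()

for row in response.json()["results"]:
    # Print a few key fields to confirm the scraper mapped them as expected.
    print(row.get("listing_id"), row.get("title"), row.get("price"), row.get("scraped_at"))
```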
## 9. Set Up Views for the AI Chatbot

To make it easier for your AI chatbot to query the data, create filtered views:

1. In your table, click "Create view"
2. Select "Grid view"
3. Name it appropriately (e.g., "Recent Listings")
4. Add filters as needed (e.g., filter by make, model, or date range)
5. Save the view

## 10. Handling Duplicates

The scraper handles duplicates by checking whether a listing with the same ID already exists. If it does, the scraper updates the existing record rather than creating a new one (see `storeCarListing` in `src/baserow.js`).

## 11. Data Maintenance

Periodically:

1. Check for and remove any duplicate entries (a clean-up sketch follows below)
2. Archive or delete very old listings that are no longer relevant
3. Verify data integrity, especially for fields that power your AI chatbot's price estimation
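If duplicates do accumulate (for example from runs made before the BaseRow integration was configured), they can be cleaned up via the API. This is only a sketch under the same placeholder token/table assumptions: it keeps the most recently updated row per `listing_id` and deletes the rest, so try it on a copy of the table first.

```python
import requests
from collections import defaultdict

API_TOKEN = "YOUR_BASEROW_API_TOKEN"   # placeholder
TABLE_ID = "YOUR_TABLE_ID"             # placeholder
BASE_URL = f"https://api.baserow.io/api/database/rows/table/{TABLE_ID}/"
HEADERS = {"Authorization": f"Token {API_TOKEN}"}

# Collect every row, following the paginated "next" links.
rows = []
url = BASE_URL + "?user_field_names=true&size=200"
while url:
    data = requests.get(url, headers=HEADERS).json()
    rows.extend(data["results"])
    url = data.get("next")

# Group rows by listing_id and keep only the newest one in each group.
groups = defaultdict(list)
for row in rows:
    groups[row.get("listing_id")].append(row)

for listing_id, group in groups.items():
    group.sort(key=lambda r: r.get("last_updated") or "", reverse=True)
    for duplicate in group[1:]:  # every row except the most recently updated
        requests.delete(f"{BASE_URL}{duplicate['id']}/", headers=HEADERS)
        print(f"Deleted duplicate row {duplicate['id']} for listing {listing_id}")
```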
## 12. Troubleshooting

If data is not appearing in BaseRow:

1. Verify your API token has the correct permissions
2. Check that the table and database IDs are correct
3. Look at the Apify Actor logs for any API errors
4. Ensure your BaseRow account has not reached any usage limits

A quick scripted check for the first two points follows.
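For points 1 and 2, a single call against the list-rows endpoint is usually enough; the status-code interpretation below follows normal REST conventions and is a sketch, not BaseRow-specific documentation.

```python
import requests

API_TOKEN = "YOUR_BASEROW_API_TOKEN"   # placeholder
TABLE_ID = "YOUR_TABLE_ID"             # placeholder

response = requests.get(
    f"https://api.baserow.io/api/database/rows/table/{TABLE_ID}/",
    headers={"Authorization": f"Token {API_TOKEN}"},
    params={"user_field_names": "true", "size": 1},
)

# 200       -> token and table ID look fine
# 401 / 403 -> the token is missing, invalid, or lacks permission
# 404       -> the table ID is wrong or the token cannot see that table
print(response.status_code)
print(response.text[:500])
```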
CHATBOT_INTEGRATION.md
# AI Chatbot Integration Guide for Sahibinden.com Scraper

This guide provides instructions for integrating the scraped car listing data from BaseRow with an AI chatbot for used car price estimation.

## Overview

The AI chatbot will:

1. Accept user descriptions of cars (e.g., "2017 Passat 3 parça boya 150bin km")
2. Query BaseRow for comparable listings
3. Calculate an estimated price range
4. Return the estimate to the user with supporting data

## Data Access

### Direct BaseRow API Access

The AI chatbot can access the data directly from BaseRow using its API:

```python
import requests
import json

def query_baserow(api_token, table_id, filters=None):
    """
    Query BaseRow table with optional filters

    Args:
        api_token (str): BaseRow API token
        table_id (str): BaseRow table ID
        filters (dict): Optional filters to apply

    Returns:
        list: Matching car listings
    """
    headers = {
        'Authorization': f'Token {api_token}',
        'Content-Type': 'application/json'
    }

    url = f'https://api.baserow.io/api/database/rows/table/{table_id}/'

    params = {
        'user_field_names': 'true'
    }

    if filters:
        for key, value in filters.items():
            params[f'filter__{key}'] = value

    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        return response.json()['results']
    else:
        raise Exception(f"Error querying BaseRow: {response.text}")
```

### Example Usage

```python
# Find similar cars based on make, model, and year
similar_cars = query_baserow(
    api_token='YOUR_API_TOKEN',
    table_id='YOUR_TABLE_ID',
    filters={
        'make__contains': 'Volkswagen',
        'model__contains': 'Passat',
        'year__contains': '2017'
    }
)

# Process results
for car in similar_cars:
    print(f"Title: {car['title']}")
    print(f"Price: {car['price']} {car['price_currency']}")
    print(f"Mileage: {car['mileage']}")
    print(f"Location: {car['location']}")
    print("---")
```

## Price Estimation Algorithm

Here's a simple algorithm for estimating car prices:

```python
def estimate_price(user_description, api_token, table_id):
    """
    Estimate car price based on user description

    Args:
        user_description (str): User's description of the car
        api_token (str): BaseRow API token
        table_id (str): BaseRow table ID

    Returns:
        dict: Price estimation and supporting data
    """
    # Extract key information from the user description
    extracted_info = extract_car_info(user_description)

    # Query similar cars
    filters = {}
    if 'make' in extracted_info:
        filters['make__contains'] = extracted_info['make']
    if 'model' in extracted_info:
        filters['model__contains'] = extracted_info['model']
    if 'year' in extracted_info:
        filters['year__contains'] = extracted_info['year']

    similar_cars = query_baserow(api_token, table_id, filters)

    if not similar_cars:
        return {
            'status': 'no_matches',
            'message': 'No similar cars found in our database.'
        }

    # Calculate price statistics
    prices = [car['price'] for car in similar_cars if car['price']]

    if not prices:
        return {
            'status': 'no_prices',
            'message': 'Found similar cars but no valid price data.'
        }

    avg_price = sum(prices) / len(prices)
    min_price = min(prices)
    max_price = max(prices)

    # Adjust based on mileage if available
    if 'mileage' in extracted_info:
        user_mileage = extract_numeric(extracted_info['mileage'])
        if user_mileage:
            # Calculate average mileage
            mileages = [extract_numeric(car['mileage']) for car in similar_cars if car['mileage']]
            mileages = [m for m in mileages if m]  # Filter out None values

            if mileages:
                avg_mileage = sum(mileages) / len(mileages)
                # Adjust price based on mileage difference
                if avg_mileage > 0:
                    mileage_factor = 1 - ((user_mileage - avg_mileage) / avg_mileage) * 0.1
                    avg_price = avg_price * mileage_factor

    # Adjust based on damage status if available
    if 'damage' in extracted_info:
        damage_status = extracted_info['damage']
        if damage_status:
            # Reduce the price for damaged cars
            avg_price = avg_price * 0.85

    return {
        'status': 'success',
        'estimated_price': round(avg_price),
        'price_range': {
            'min': round(min_price),
            'max': round(max_price)
        },
        'similar_cars_count': len(similar_cars),
        'currency': similar_cars[0]['price_currency'] if similar_cars else 'TL',
        'similar_cars': similar_cars[:5]  # Return top 5 similar cars
    }

def extract_car_info(description):
    """
    Extract car information from a user description
    This is a simplified example - in a real implementation,
    you would use NLP techniques or a language model
    """
    info = {}

    # Extract make and model
    common_makes = ['Volkswagen', 'BMW', 'Mercedes', 'Audi', 'Toyota', 'Honda', 'Ford', 'Renault']
    for make in common_makes:
        if make.lower() in description.lower():
            info['make'] = make
            # Look for common models for this make
            models = []  # default, so makes without a model list below don't break the loop
            if make == 'Volkswagen':
                models = ['Passat', 'Golf', 'Polo', 'Tiguan', 'Jetta']
            elif make == 'BMW':
                models = ['320', '520', 'X5', 'X3', 'M3', 'M5']
            # Add more makes and models as needed

            for model in models:
                if model.lower() in description.lower():
                    info['model'] = model
                    break
            break

    # Extract year (4-digit number between 1990 and 2029)
    import re
    year_matches = re.findall(r'\b(19[9][0-9]|20[0-2][0-9])\b', description)
    if year_matches:
        info['year'] = year_matches[0]

    # Extract mileage (number followed by km, bin km, or similar)
    mileage_matches = re.findall(r'(\d+)(?:\s*(?:bin|k|b|000))?(?:\s*km)', description.lower())
    if mileage_matches:
        mileage = int(mileage_matches[0])
        # If the number is small, assume it's in thousands
        if mileage < 1000:
            mileage *= 1000
        info['mileage'] = str(mileage)

    # Extract damage information
    damage_keywords = ['hasar', 'boya', 'değişen', 'tramer']
    for keyword in damage_keywords:
        if keyword in description.lower():
            info['damage'] = True
            break

    return info

def extract_numeric(text):
    """Extract numeric value from text"""
    if not text:
        return None

    import re
    numbers = re.findall(r'\d+', text)
    if numbers:
        return int(''.join(numbers))
    return None
```
## Integration with Popular AI Frameworks

### OpenAI Integration

```python
import json

import openai

def get_price_estimate_openai(user_query, api_token, table_id):
    # First extract structured data from the user query
    extraction_prompt = f"""
    Extract car information from the following user query:
    "{user_query}"

    Return a JSON object with the following fields if present:
    - make: Car manufacturer
    - model: Car model
    - year: Manufacturing year
    - mileage: Mileage in km
    - damage: Boolean indicating if damage is mentioned
    """

    extraction_response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a car information extraction assistant."},
            {"role": "user", "content": extraction_prompt}
        ]
    )

    extracted_info = json.loads(extraction_response.choices[0].message.content)

    # Query BaseRow for similar cars
    filters = {}
    if 'make' in extracted_info:
        filters['make__contains'] = extracted_info['make']
    if 'model' in extracted_info:
        filters['model__contains'] = extracted_info['model']
    if 'year' in extracted_info:
        filters['year__contains'] = extracted_info['year']

    similar_cars = query_baserow(api_token, table_id, filters)

    # Generate a price estimate and response
    if not similar_cars:
        return "I couldn't find any similar cars in our database. Please provide more details about the car."

    # Calculate price statistics and prepare data for the AI
    prices = [car['price'] for car in similar_cars if car['price']]
    avg_price = sum(prices) / len(prices) if prices else 0
    min_price = min(prices) if prices else 0
    max_price = max(prices) if prices else 0

    # Prepare data for the AI to generate a response
    car_data = {
        "query": user_query,
        "extracted_info": extracted_info,
        "similar_cars_count": len(similar_cars),
        "price_stats": {
            "average": round(avg_price),
            "minimum": round(min_price),
            "maximum": round(max_price),
            "currency": similar_cars[0]['price_currency'] if similar_cars else "TL"
        },
        "example_listings": similar_cars[:3]  # First 3 similar cars
    }

    # Generate a natural language response
    response_prompt = f"""
    Based on the following car data, provide a price estimate and explanation:
    {json.dumps(car_data, indent=2)}

    Your response should include:
    1. The estimated price range
    2. Factors that influence the price
    3. A brief explanation of how you arrived at this estimate
    4. Any caveats or additional information the user should know
    """

    final_response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a car price estimation assistant."},
            {"role": "user", "content": response_prompt}
        ]
    )

    return final_response.choices[0].message.content
```
## Handling Edge Cases

### Insufficient Data

When there aren't enough similar listings:

```python
def get_fallback_estimate(extracted_info, api_token, table_id):
    """Get a fallback estimate when exact matches aren't available"""
    # Try with just the make
    if 'make' in extracted_info:
        similar_make = query_baserow(
            api_token,
            table_id,
            {'make__contains': extracted_info['make']}
        )

        if similar_make:
            return {
                'status': 'partial_match',
                'message': f"Found {len(similar_make)} cars of the same make, but not the exact model or year.",
                'data': calculate_price_stats(similar_make)
            }

    # Try with just a year range
    if 'year' in extracted_info:
        year = int(extracted_info['year'])
        year_range = query_baserow(
            api_token,
            table_id,
            {
                'year__gte': str(year - 2),
                'year__lte': str(year + 2)
            }
        )

        if year_range:
            return {
                'status': 'year_range_match',
                'message': f"Found {len(year_range)} cars from similar years ({year-2}-{year+2}).",
                'data': calculate_price_stats(year_range)
            }

    return {
        'status': 'no_data',
        'message': "Insufficient data to provide an estimate."
    }
```
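The fallback code above calls `calculate_price_stats`, which is not defined anywhere in this guide. A minimal version, consistent with how `estimate_price` summarizes prices, might look like this:

```python
def calculate_price_stats(cars):
    """Summarize prices for a list of car rows returned by query_baserow."""
    prices = [car['price'] for car in cars if car.get('price')]

    if not prices:
        return {
            'status': 'no_prices',
            'message': 'Found similar cars but no valid price data.'
        }

    return {
        'average_price': round(sum(prices) / len(prices)),
        'price_range': {'min': round(min(prices)), 'max': round(max(prices))},
        'sample_size': len(prices),
        'currency': cars[0].get('price_currency', 'TL')
    }
```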
### Handling Ambiguous Queries

For ambiguous queries, prompt the user for clarification:

```python
def handle_ambiguous_query(user_query, possible_matches):
    """Handle ambiguous car queries by asking for clarification"""
    if len(possible_matches) > 1:
        makes = set(car['make'] for car in possible_matches if 'make' in car)
        models = set(car['model'] for car in possible_matches if 'model' in car)

        clarification_message = "I found multiple possible matches. Could you specify which one you mean?\n\n"

        if len(makes) > 1:
            clarification_message += f"Makes: {', '.join(makes)}\n"

        if len(models) > 1:
            clarification_message += f"Models: {', '.join(models)}\n"

        return {
            'status': 'needs_clarification',
            'message': clarification_message,
            'options': [f"{car['make']} {car['model']} {car['year']}" for car in possible_matches[:5]]
        }
```

## Performance Optimization

For better performance with large datasets:

1. Create indexes on frequently queried fields in BaseRow
2. Cache common queries
3. Implement pagination for large result sets
4. Pre-process and aggregate data for common queries

A small sketch combining points 2 and 3 follows.
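The sketch below wraps the BaseRow row queries with a simple in-memory cache and explicit pagination parameters. It is only an outline under the same placeholder assumptions as the earlier examples; the `page` and `size` parameters are assumed to control pagination.

```python
import time
import requests

CACHE_TTL_SECONDS = 300   # how long a cached query result stays fresh
_cache = {}               # (table_id, filters, page) -> (timestamp, rows)

def query_baserow_page(api_token, table_id, filters=None, page=1, size=100):
    """Fetch one page of rows, caching results for repeated queries."""
    key = (table_id, frozenset((filters or {}).items()), page)
    cached = _cache.get(key)
    if cached and time.time() - cached[0] < CACHE_TTL_SECONDS:
        return cached[1]

    params = {'user_field_names': 'true', 'page': page, 'size': size}
    for field, value in (filters or {}).items():
        params[f'filter__{field}'] = value

    response = requests.get(
        f'https://api.baserow.io/api/database/rows/table/{table_id}/',
        headers={'Authorization': f'Token {api_token}'},
        params=params,
    )
    response.raise_for_status()

    rows = response.json()['results']
    _cache[key] = (time.time(), rows)
    return rows
```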
## Conclusion

This integration guide provides the foundation for connecting your AI chatbot to the scraped Sahibinden.com data in BaseRow. The actual implementation may vary depending on your specific AI platform and requirements.

For best results:

1. Regularly update the scraped data
2. Fine-tune the price estimation algorithm based on user feedback
3. Expand the car information extraction to handle more complex queries
4. Consider implementing a feedback mechanism to improve estimates over time
package.json
{ "name": "sahibinden-scraper-puppeteer-js", "version": "1.0.0", "description": "Robust web scraper for Sahibinden.com using Puppeteer + Stealth (JavaScript ESM)", "main": "src/main.js", "type": "module", "dependencies": { "apify": "^3.1.0", "crawlee": "^3.13.0", "puppeteer": "*", "puppeteer-extra": "latest", "puppeteer-extra-plugin-stealth": "latest" }, "scripts": { "start": "node src/main.js", "start:prod": "node src/main.js" }, "author": "AI Assistant / User", "license": "ISC"}