sahibinden-scraper-puppeteer-js
Under maintenance
Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 1
Runs succeeded: 11 (>99%)
Last modified: 11 days ago
.actor/Dockerfile
1# Specify the base Docker image.
2FROM apify/actor-node-puppeteer-chrome:20 AS intermediate
3
4# Check preinstalled packages
5RUN npm ls crawlee apify puppeteer playwright
6
7# Copy just package.json and package-lock.json
8COPY package*.json ./
9
10# Install all dependencies. Don't audit to speed up the installation.
11# NOTE: there is no TypeScript build step in this project, so this stage only verifies that the dependencies install cleanly.
12RUN npm install --include=dev --audit=false
13
14# Copy the source files using the user set
15# in the base image.
16COPY . ./
17
18# No 'npm run build' step is needed; the project ships plain JavaScript.
19
20# Create final image
21FROM apify/actor-node-puppeteer-chrome:20
22
23# Check preinstalled packages
24RUN npm ls crawlee apify puppeteer playwright
25
26# Copy just package.json and package-lock.json
27COPY package*.json ./
28
29# Install ONLY production dependencies
30RUN npm --quiet set progress=false \
31 && npm install --omit=dev --omit=optional \
32 && echo "Installed NPM packages:" \
33 && (npm list --omit=dev --all || true) \
34 && echo "Node.js version:" \
35 && node --version \
36 && echo "NPM version:" \
37 && npm --version \
38 && rm -r ~/.npm
39
40# No COPY of /dist from the intermediate stage is needed; there is no build output.
41
42# Copy the remaining files and directories with the source code.
43COPY . ./
44
45# Run the image.
46CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor",
4    "title": "Sahibinden.com Scraper (Puppeteer, JavaScript)",
5    "description": "Crawlee and Puppeteer scraper for Sahibinden.com car listings, written in JavaScript (ESM).",
6 "version": "0.0",
7 "meta": {
8 "templateId": "ts-crawlee-puppeteer-chrome"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2    "title": "Sahibinden.com Scraper Input",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with.",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13                "url": "https://www.sahibinden.com/otomobil/elektrik"
14 }
15 ]
16 }
17 }
18}
src/baserow.js
1import axios from 'axios';
2import { Actor, log } from 'apify';
3
4/**
5 * BaseRow integration for storing scraped car listings
6 */
7export class BaseRowIntegration {
8 /**
9 * Initialize BaseRow integration
10 * @param {Object} options Configuration options
11 * @param {string} options.apiToken BaseRow API token
12 * @param {string} options.tableId BaseRow table ID
13 * @param {string} options.databaseId BaseRow database ID
14 */
15 constructor(options) {
16 const { apiToken, tableId, databaseId } = options;
17
18 if (!apiToken) throw new Error('BaseRow API token is required');
19 if (!tableId) throw new Error('BaseRow table ID is required');
20 if (!databaseId) throw new Error('BaseRow database ID is required');
21
22 this.apiToken = apiToken;
23 this.tableId = tableId;
24 this.databaseId = databaseId;
25 this.baseUrl = 'https://api.baserow.io/api';
26 this.client = axios.create({
27 baseURL: this.baseUrl,
28 headers: {
29 'Authorization': `Token ${this.apiToken}`,
30 'Content-Type': 'application/json'
31 }
32 });
33
34 log.info('BaseRow integration initialized', { tableId, databaseId });
35 }
36
37 /**
38 * Store a car listing in BaseRow
39 * @param {Object} carData Car listing data
40 * @returns {Promise<Object>} Created row data
41 */
42 async storeCarListing(carData) {
43 try {
44 // Prepare data for BaseRow
45 const rowData = this._prepareRowData(carData);
46
47 // Check if listing already exists to avoid duplicates
48 const existingRow = await this._findExistingListing(carData.id);
49
50 if (existingRow) {
51 log.info(`Updating existing listing: ${carData.id}`);
52 return await this._updateRow(existingRow.id, rowData);
53 } else {
54 log.info(`Creating new listing: ${carData.id}`);
55 return await this._createRow(rowData);
56 }
57 } catch (error) {
58 log.error(`Error storing car listing in BaseRow: ${error.message}`);
59 throw error;
60 }
61 }
62
63 /**
64 * Store multiple car listings in BaseRow
65 * @param {Array<Object>} carListings Array of car listing data
66 * @returns {Promise<Array<Object>>} Created/updated row data
67 */
68 async storeCarListings(carListings) {
69 log.info(`Storing ${carListings.length} car listings in BaseRow`);
70
71 const results = [];
72 for (const carData of carListings) {
73 try {
74 const result = await this.storeCarListing(carData);
75 results.push(result);
76 } catch (error) {
77 log.error(`Error storing car listing ${carData.id}: ${error.message}`);
78 // Continue with next listing
79 }
80 }
81
82 log.info(`Successfully stored ${results.length} out of ${carListings.length} car listings`);
83 return results;
84 }
85
86 /**
87 * Find an existing listing by ID
88 * @param {string} listingId Sahibinden.com listing ID
89 * @returns {Promise<Object|null>} Existing row or null if not found
90 * @private
91 */
92 async _findExistingListing(listingId) {
93 try {
94 const response = await this.client.get(
95 `/database/rows/table/${this.tableId}/`,
96 {
97 params: {
98 search: listingId,
99 user_field_names: true
100 }
101 }
102 );
103
104 const rows = response.data.results;
105            return rows.find((row) => String(row.listing_id) === String(listingId)) || null; // match on the Sahibinden listing_id, not the BaseRow row id
106 } catch (error) {
107 log.error(`Error finding existing listing: ${error.message}`);
108 return null;
109 }
110 }
111
112 /**
113 * Create a new row in BaseRow
114 * @param {Object} rowData Row data
115 * @returns {Promise<Object>} Created row data
116 * @private
117 */
118 async _createRow(rowData) {
119 const response = await this.client.post(
120 `/database/rows/table/${this.tableId}/`,
121 rowData,
122 {
123 params: {
124 user_field_names: true
125 }
126 }
127 );
128
129 return response.data;
130 }
131
132 /**
133 * Update an existing row in BaseRow
134 * @param {number} rowId BaseRow row ID
135 * @param {Object} rowData Row data
136 * @returns {Promise<Object>} Updated row data
137 * @private
138 */
139 async _updateRow(rowId, rowData) {
140 const response = await this.client.patch(
141 `/database/rows/table/${this.tableId}/${rowId}/`,
142 rowData,
143 {
144 params: {
145 user_field_names: true
146 }
147 }
148 );
149
150 return response.data;
151 }
152
153 /**
154 * Prepare car data for BaseRow
155 * @param {Object} carData Car listing data
156 * @returns {Object} Prepared row data
157 * @private
158 */
159 _prepareRowData(carData) {
160 // Map car data to BaseRow fields
161 // Field names should match the BaseRow table structure
162 return {
163 listing_id: carData.id,
164 url: carData.url,
165 title: carData.title,
166 price: carData.price,
167 price_currency: carData.price_currency,
168 location: carData.location,
169 description: carData.description,
170
171 // Main info fields
172 make: carData.info?.Marka || '',
173 model: carData.info?.Model || '',
174 series: carData.info?.Seri || '',
175 year: carData.info?.Yıl || '',
176 fuel_type: carData.info?.Yakıt || '',
177 transmission: carData.info?.Vites || '',
178 mileage: carData.info?.KM || '',
179 body_type: carData.info?.['Kasa Tipi'] || '',
180 engine_power: carData.info?.['Motor Gücü'] || '',
181 engine_capacity: carData.info?.['Motor Hacmi'] || '',
182 drive_type: carData.info?.Çekiş || '',
183 doors: carData.info?.Kapı || '',
184 color: carData.info?.Renk || '',
185 warranty: carData.info?.Garanti || '',
186 damage_record: carData.info?.['Ağır Hasar Kayıtlı'] || '',
187 plate_nationality: carData.info?.['Plaka / Uyruk'] || '',
188 seller_type: carData.info?.Kimden || '',
189 trade_in: carData.info?.Takas || '',
190 condition: carData.info?.Durumu || '',
191
192 // Images as JSON string
193 images: JSON.stringify(carData.images || []),
194
195 // Attributes and technical specs as JSON strings
196 attributes: JSON.stringify(carData.attributes || {}),
197 technical_specs: JSON.stringify(carData.technicalSpecs || {}),
198
199 // Metadata
200 scraped_at: carData.scrapedAt,
201 last_updated: new Date().toISOString()
202 };
203 }
204}
205
206/**
207 * Create BaseRow integration from Actor input
208 * @returns {Promise<BaseRowIntegration>} BaseRow integration instance
209 */
210export async function createBaseRowIntegration() {
211 const input = await Actor.getInput() || {};
212 const { baseRowApiToken, baseRowTableId, baseRowDatabaseId } = input;
213
214 if (!baseRowApiToken || !baseRowTableId || !baseRowDatabaseId) {
215 log.warning('BaseRow integration not configured. Data will only be stored in Apify dataset.');
216 return null;
217 }
218
219 return new BaseRowIntegration({
220 apiToken: baseRowApiToken,
221 tableId: baseRowTableId,
222 databaseId: baseRowDatabaseId
223 });
224}
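Note: `src/baserow.js` is not imported anywhere in the `src/main.js` shown below, so scraped rows currently land only in the Apify dataset. A minimal sketch of how the integration could be wired in, assuming the `baseRowApiToken`, `baseRowTableId` and `baseRowDatabaseId` input fields that `createBaseRowIntegration()` already reads:

```javascript
// Sketch only - not part of the current main.js.
import { Actor } from 'apify';
import { createBaseRowIntegration } from './baserow.js';

const baseRow = await createBaseRowIntegration(); // resolves to null when the BaseRow input fields are missing

export async function pushResults(results) {
    await Actor.pushData(results);                        // the Apify dataset stays the primary store
    if (baseRow) await baseRow.storeCarListings(results); // mirror to BaseRow when configured
}
```

One caveat: the category-page rows pushed by `main.js` use Turkish column names ('Marka', 'Fiyat', ...), while `storeCarListing()` expects the detail-page shape from `src/schema.json` (`id`, `info`, `images`, ...), so a small mapping step would be needed before mirroring.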
src/main.js
1// src/main.js - Puppeteer JavaScript Version for Bulk Category Scraping
2import { Actor } from 'apify';
3import { PuppeteerCrawler, log } from 'crawlee';
4// Import puppeteer-extra and the stealth plugin
5import puppeteer from 'puppeteer-extra';
6import StealthPlugin from 'puppeteer-extra-plugin-stealth';
7
8// Apply the stealth plugin to puppeteer
9puppeteer.use(StealthPlugin());
10
11// Initialize the Apify Actor
12await Actor.init();
13
14// Get input - simplified for bulk scraping
15const input = await Actor.getInput() || {};
16const {
17 startUrls = ['https://www.sahibinden.com/otomobil/elektrik'], // Specific category
18 maxItems = null, // Optional: Limit total listings scraped (null for all)
19 maxConcurrency = 5, // Can increase concurrency slightly for category pages
20 maxRequestsPerCrawl = 500, // Increase to handle many pages
21 proxyConfiguration = {
22 useApifyProxy: true,
23 apifyProxyGroups: ['RESIDENTIAL'],
24 }
25} = input;
26
27// Create the final proxy configuration object
28const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);
29
30log.info('Starting BULK scraper', { startUrls, maxItems, maxConcurrency, maxRequestsPerCrawl });
31if (proxyConfig) {
32 log.info('Using proxy configuration', { type: proxyConfig.usesApifyProxy ? 'Apify Proxy' : 'Custom Proxies', groups: proxyConfig.apifyProxyGroups });
33} else {
34 log.info('No proxy configuration specified.');
35}
36
37let scrapedItemsCount = 0; // Counter for maxItems
38
39// --- Helper function for random User Agent (the list itself lives in src/utils.js) ---
40import { randomUserAgent as pickRandomUserAgent } from './utils.js';
41function randomUserAgent() {
42    return pickRandomUserAgent();
43}
44// -------------------------------------------
45
46
47// Create the Puppeteer crawler
48const crawler = new PuppeteerCrawler({
49 proxyConfiguration: proxyConfig,
50 maxConcurrency,
51 maxRequestsPerCrawl,
52 launchContext: {
53 launcher: puppeteer,
54 launchOptions: { headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] },
55 useChrome: false
56 },
57 preNavigationHooks: [ /* ... Keep your existing preNavigationHooks ... */ ],
58
59 requestHandler: async ({ page, request, enqueueLinks, log }) => {
60 log.info(`Processing page: ${request.url}`);
61 try {
62 // ... Keep Cloudflare check logic ...
63
64 await page.waitForSelector('body', { timeout: 45000 });
65
66 // --- Always handle as category page now ---
67 await handleCategoryPage(page, request, enqueueLinks, log);
68
69 } catch (error) {
70 let errorMessage = 'Unknown error during request handling';
71 if (error instanceof Error) { errorMessage = error.message; } else { errorMessage = String(error); }
72 log.error(`Error processing ${request.url}: ${errorMessage}`, { stack: error instanceof Error ? error.stack : undefined });
73 throw error;
74 }
75 },
76 failedRequestHandler: async ({ request, log }) => {
77 // ... Keep existing failedRequestHandler ...
78 },
79});
80
81// --- MODIFIED Handler Function for Category Pages ---
82
83async function handleCategoryPage(page, request, enqueueLinks, log) {
84 log.info(`Handling category page: ${request.url}`);
85
86 // --- VERIFY THESE SELECTORS on https://www.sahibinden.com/otomobil/elektrik ---
87 const listingRowSelector = 'tbody.searchResultsRowClass > tr.searchResultsItem'; // Row for each listing
88 // Selectors relative to the listingRowSelector element:
89 const makeSelector = 'td:nth-child(2)'; // Assuming Make is the 2nd column
90 const seriesSelector = 'td:nth-child(3)'; // Assuming Series is the 3rd column
91 const modelSelector = 'td:nth-child(4)'; // Assuming Model is the 4th column
92 const titleLinkSelector = 'td.searchResultsTitleValue a.classifiedTitle'; // Title link
93 const yearSelector = 'td:nth-child(7)'; // Assuming Year is the 7th column (VERIFY INDEX!)
94 const kmSelector = 'td:nth-child(8)'; // Assuming KM is the 8th column (VERIFY INDEX!)
95 const priceSelector = 'td.searchResultsPriceValue span'; // Price text
96 const dateSelector = 'td.searchResultsDateValue'; // Date column
97 const locationSelector = 'td.searchResultsLocationValue'; // Location column
98 // Selector for the next page link:
99 const nextPageSelector = 'a.prevNextBut[title="Sonraki"]:not(.passive)';
100 // -------------------------------------------------------------------------
101
102 try {
103 await page.waitForSelector(listingRowSelector, { timeout: 45000 });
104
105 const listingElements = await page.$$(listingRowSelector);
106 log.info(`Found ${listingElements.length} listings on page.`);
107
108 const results = []; // Collect results from this page
109
110 for (const element of listingElements) {
111
112 // Check if maxItems limit is reached
113 if (maxItems !== null && scrapedItemsCount >= maxItems) {
114 log.info(`Maximum items limit (${maxItems}) reached. Stopping scrape.`);
115 // Abort the crawl cleanly
116 await crawler.autoscaledPool.abort();
117 return; // Stop processing this page
118 }
119
120 try {
121 // Extract data relative to the current element 'element'
122 const make = await element.$eval(makeSelector, el => el.textContent?.trim()).catch(() => null);
123 const series = await element.$eval(seriesSelector, el => el.textContent?.trim()).catch(() => null);
124 const model = await element.$eval(modelSelector, el => el.textContent?.trim()).catch(() => null);
125 const titleElement = await element.$(titleLinkSelector); // Get handle for title/link
126 const title = await titleElement?.evaluate(el => el.textContent?.trim()).catch(() => null);
127 const detailUrl = await titleElement?.evaluate(el => el.href).catch(() => null);
128 const year = await element.$eval(yearSelector, el => el.textContent?.trim()).catch(() => null);
129 const km = await element.$eval(kmSelector, el => el.textContent?.trim()).catch(() => null);
130 const price = await element.$eval(priceSelector, el => el.textContent?.trim()).catch(() => null);
131                    const date = await element.$eval(dateSelector, el => el.innerText?.trim().replace(/\s*\n\s*/g, ' ')).catch(() => null); // innerText renders <br> as a newline
132                    const location = await element.$eval(locationSelector, el => el.innerText?.trim().replace(/\s*\n\s*/g, ' / ')).catch(() => null); // join city/district (separated by <br>) with ' / '
133
134 // Basic check if we got essential data
135 if (title && detailUrl) {
136 const listingData = {
137 // Match field names exactly as requested
138 'Marka': make,
139 'Seri': series,
140 'Model': model,
141 'İlan Başlığı': title,
142 'Yıl': year,
143 'KM': km, // Keep as string for now, cleaning later is easier
144 'Fiyat': price, // Keep as string
145 'İlan Tarihi': date,
146 'İl / İlçe': location,
147 'Detay Linki': detailUrl // Good to store the link too
148 };
149 results.push(listingData);
150 scrapedItemsCount++;
151 } else {
152 log.debug('Skipping row due to missing title or detailUrl.');
153 }
154
155 } catch (extractError) {
156 let errorMsg = 'Unknown error extracting item data';
157 if (extractError instanceof Error) { errorMsg = extractError.message; } else { errorMsg = String(extractError); }
158 log.warning(`Could not process one item fully on ${request.url}`, { error: errorMsg });
159 }
160 } // End of loop
161
162 // Push all results from this page
163 if (results.length > 0) {
164 await Actor.pushData(results);
165 log.info(`Pushed ${results.length} listings from page ${request.url}. Total scraped: ${scrapedItemsCount}`);
166 } else {
167 log.info(`No listings extracted from page ${request.url}.`);
168 }
169
170 // --- Enqueue next page ---
171 // Check if maxItems limit is reached before enqueueing next page
172 if (maxItems !== null && scrapedItemsCount >= maxItems) {
173 log.info(`Maximum items limit (${maxItems}) reached. Not enqueueing next page.`);
174 await crawler.autoscaledPool.abort(); // Ensure crawl stops
175 return;
176 }
177
178 const nextPageUrl = await page.$eval(nextPageSelector, anchor => anchor.href).catch(() => null);
179 if (nextPageUrl) {
180 log.info(`Enqueueing next category page: ${nextPageUrl}`);
181 const absoluteNextPageUrl = new URL(nextPageUrl, request.loadedUrl || request.url).toString();
182 // Add request with the same label 'CATEGORY'
183 // NOTE: No label needed if only one handler, but good practice
184 await enqueueLinks({
185 urls: [absoluteNextPageUrl],
186 userData: { label: 'CATEGORY' }, // Use label for consistency
187 });
188 } else {
189 log.info(`No next page button found or clickable on ${request.url}`);
190 }
191
192 } catch (error) {
193 let errorMessage = 'Unknown error handling category page';
194 if (error instanceof Error) { errorMessage = error.message; } else { errorMessage = String(error); }
195 log.warning(`Could not handle category page ${request.url}: ${errorMessage}`);
196 // Consider if this should throw error to retry the page
197 // throw error;
198 }
199}
200
201// --- handleDetailPage is no longer needed for this approach ---
202// async function handleDetailPage(page, request, log) { ... }
203
204
205// --- Start the Crawler ---
206const startRequests = startUrls.map(item => {
207 // ... (Keep the existing logic to handle string or object input for startUrls) ...
208 let urlString;
209 if (typeof item === 'string') { urlString = item; }
210 else if (item && typeof item.url === 'string') { urlString = item.url; }
211 else { log.warning('Skipping invalid start URL item:', { item }); return null; }
212
213 if (!urlString || !urlString.startsWith('http')) {
214 log.warning('Skipping item with invalid URL string:', { urlString }); return null;
215 }
216 return { url: urlString, userData: { label: 'CATEGORY' } }; // Always label as CATEGORY now
217}).filter(req => req !== null);
218
219
220if (startRequests.length > 0) {
221 await crawler.addRequests(startRequests);
222 log.info(`Added ${startRequests.length} initial requests to the queue.`);
223} else {
224 log.warning('No valid start URLs found in the input. Exiting.');
225    await Actor.exit('No valid start URLs provided.', { exitCode: 1 });
226}
227
228log.info('Starting the crawler...');
229await crawler.run();
230log.info(`Crawler finished. Total items scraped: ${scrapedItemsCount}`);
231
232await Actor.exit();
233// --- End of Script ---
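The `preNavigationHooks`, the Cloudflare check and the `failedRequestHandler` above are elided ("keep as before"). For reference, a minimal sketch of what such a pre-navigation hook could look like, reusing the helpers that already exist in `src/utils.js` (the header values and delays are illustrative, not the elided original):

```javascript
// Illustrative sketch of a preNavigationHook - not the elided original.
import { randomUserAgent, randomDelay } from './utils.js';

const preNavigationHooks = [
    async ({ page }, gotoOptions) => {
        await page.setUserAgent(randomUserAgent());      // rotate the user agent per navigation
        await page.setExtraHTTPHeaders({ 'Accept-Language': 'tr-TR,tr;q=0.9,en;q=0.8' });
        gotoOptions.waitUntil = 'domcontentloaded';      // don't block on every asset
        await randomDelay(1000, 3000);                   // short human-like pause before navigation
    },
];
```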
src/routes.js
1import { Dataset, createPuppeteerRouter } from 'crawlee';
2
3export const router = createPuppeteerRouter();
4
5router.addDefaultHandler(async ({ enqueueLinks, log }) => {
6 log.info(`enqueueing new URLs`);
7 await enqueueLinks({
8 globs: ['https://apify.com/*'],
9 label: 'detail',
10 });
11});
12
13router.addHandler('detail', async ({ request, page, log }) => {
14 const title = await page.title();
15 log.info(`${title}`, { url: request.loadedUrl });
16
17 await Dataset.pushData({
18 url: request.loadedUrl,
19 title,
20 });
21});
src/scheduler.js
1import { Actor } from 'apify';
2import { log } from 'crawlee';
3import { createBaseRowIntegration } from './baserow.js';
4
5/**
6 * Scheduler for automating the Sahibinden.com scraper
7 * This module handles the integration with Apify Scheduler
8 */
9export class ScraperScheduler {
10 /**
11 * Initialize the scheduler
12 * @param {Object} options Configuration options
13 * @param {string} options.scheduleInterval How often to run the scraper (e.g., 'daily', 'weekly')
14 * @param {string} options.startTime Time to start the scraper (e.g., '02:00')
15 * @param {Array<string>} options.startUrls URLs to start scraping from
16 */
17 constructor(options = {}) {
18 const {
19 scheduleInterval = 'daily',
20 startTime = '02:00',
21 startUrls = ['https://www.sahibinden.com/kategori/vasita']
22 } = options;
23
24 this.scheduleInterval = scheduleInterval;
25 this.startTime = startTime;
26 this.startUrls = startUrls;
27
28 log.info('Scheduler initialized', { scheduleInterval, startTime, startUrls });
29 }
30
31 /**
32 * Create an Apify scheduler configuration
33 * @returns {Object} Scheduler configuration
34 */
35 createSchedulerConfig() {
36 // Convert schedule interval to cron expression
37 let cronExpression;
38 switch (this.scheduleInterval.toLowerCase()) {
39 case 'hourly':
40 cronExpression = '0 * * * *';
41 break;
42 case 'daily':
43 const [hours, minutes] = this.startTime.split(':').map(Number);
44 cronExpression = `${minutes || 0} ${hours || 0} * * *`;
45 break;
46 case 'weekly':
47 const [weeklyHours, weeklyMinutes] = this.startTime.split(':').map(Number);
48 cronExpression = `${weeklyMinutes || 0} ${weeklyHours || 0} * * 1`; // Monday
49 break;
50 default:
51 cronExpression = '0 2 * * *'; // Default: 2 AM daily
52 }
53
54 return {
55 cronExpression,
56 isEnabled: true,
57 isExclusive: true,
58 description: 'Scheduled run of Sahibinden.com scraper',
59 actorTaskId: Actor.getEnv().actorTaskId || undefined,
60 actorId: Actor.getEnv().actorId || undefined,
61 input: {
62 startUrls: this.startUrls,
63 maxConcurrency: 1,
64 maxRequestsPerCrawl: 1000,
65 proxyConfiguration: {
66 useApifyProxy: true,
67 apifyProxyGroups: ['RESIDENTIAL'],
68 countryCode: 'TR'
69 }
70 }
71 };
72 }
73
74 /**
75 * Set up the scheduler on Apify platform
76 * @returns {Promise<Object>} Created scheduler
77 */
78 async setupScheduler() {
79 try {
80 const client = Actor.newClient();
81 const config = this.createSchedulerConfig();
82
83 log.info('Setting up scheduler with configuration', { cronExpression: config.cronExpression });
84
85 // Create a new scheduler
86 const scheduler = await client.schedules().create({
87 name: 'Sahibinden.com Scraper Schedule',
88 cronExpression: config.cronExpression,
89 isEnabled: config.isEnabled,
90 isExclusive: config.isExclusive,
91 description: config.description,
92 actions: [{
93 actorId: config.actorId,
94 actorTaskId: config.actorTaskId,
95 input: config.input
96 }]
97 });
98
99 log.info('Scheduler created successfully', { schedulerId: scheduler.id });
100 return scheduler;
101 } catch (error) {
102 log.error('Failed to set up scheduler', { error: error.message });
103 throw error;
104 }
105 }
106
107 /**
108 * Monitor the performance of scheduled runs
109 * @param {number} limit Number of runs to retrieve
110 * @returns {Promise<Array>} Recent runs with statistics
111 */
112 async monitorPerformance(limit = 10) {
113 try {
114 const client = Actor.newClient();
115 const actorId = Actor.getEnv().actorId;
116
117 if (!actorId) {
118 log.warning('Cannot monitor performance: Actor ID not available');
119 return [];
120 }
121
122 // Get recent runs
123 const runs = await client.actor(actorId).runs().list({
124 limit,
125 desc: true
126 });
127
128 // Extract performance metrics
129 const performanceStats = runs.items.map(run => ({
130 id: run.id,
131 status: run.status,
132 startedAt: run.startedAt,
133 finishedAt: run.finishedAt,
134 durationSecs: run.finishedAt ?
135 (new Date(run.finishedAt) - new Date(run.startedAt)) / 1000 :
136 null,
137 resourceUsage: {
138 cpu: run.stats?.cpuUsage,
139 memory: run.stats?.memoryUsage,
140 },
141 datasetStats: {
142 itemCount: run.stats?.datasetsOutStats?.['default']?.itemCount || 0,
143 totalBytes: run.stats?.datasetsOutStats?.['default']?.totalBytes || 0,
144 },
145 errorMessage: run.errorMessage
146 }));
147
148 log.info('Retrieved performance statistics', {
149 runCount: performanceStats.length,
150 latestRun: performanceStats[0]
151 });
152
153 return performanceStats;
154 } catch (error) {
155 log.error('Failed to monitor performance', { error: error.message });
156 return [];
157 }
158 }
159}
160
161/**
162 * Create scheduler from Actor input
163 * @returns {Promise<ScraperScheduler>} Scheduler instance
164 */
165export async function createScheduler() {
166 const input = await Actor.getInput() || {};
167 const {
168 scheduleInterval,
169 startTime,
170 startUrls
171 } = input;
172
173 return new ScraperScheduler({
174 scheduleInterval,
175 startTime,
176 startUrls
177 });
178}
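`src/scheduler.js` is likewise never imported by `src/main.js`. A sketch of how it could be invoked from a one-off setup script (whether the `actions` payload built in `createSchedulerConfig()` matches the current Schedules API should be verified against the Apify client docs):

```javascript
// Sketch only - a one-off setup script, not part of the actor's normal run.
import { Actor } from 'apify';
import { createScheduler } from './scheduler.js';

await Actor.init();
const scheduler = await createScheduler();          // reads scheduleInterval/startTime/startUrls from input
const schedule = await scheduler.setupScheduler();  // creates the schedule via the Apify API
console.log(`Created schedule ${schedule.id}`);
const recentRuns = await scheduler.monitorPerformance(5);
console.log(`Fetched stats for ${recentRuns.length} recent runs`);
await Actor.exit();
```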
src/schema.json
1{
2 "title": "Sahibinden.com Car Listing Schema",
3 "type": "object",
4 "properties": {
5 "id": {
6 "type": "string",
7 "description": "Unique identifier for the listing"
8 },
9 "url": {
10 "type": "string",
11 "description": "Full URL of the listing"
12 },
13 "title": {
14 "type": "string",
15 "description": "Title of the car listing"
16 },
17 "price": {
18 "type": ["number", "null"],
19 "description": "Numeric price value (cleaned)"
20 },
21 "price_currency": {
22 "type": "string",
23 "description": "Currency of the price (e.g., TL, EUR)"
24 },
25 "location": {
26 "type": "string",
27 "description": "Location information (City/District/Neighborhood)"
28 },
29 "description": {
30 "type": "string",
31 "description": "Full description text"
32 },
33 "info": {
34 "type": "object",
35 "description": "Main information about the vehicle",
36 "properties": {
37 "İlan No": { "type": "string" },
38 "İlan Tarihi": { "type": "string" },
39 "Marka": { "type": "string" },
40 "Seri": { "type": "string" },
41 "Model": { "type": "string" },
42 "Yıl": { "type": "string" },
43 "Yakıt": { "type": "string" },
44 "Vites": { "type": "string" },
45 "KM": { "type": "string" },
46 "Kasa Tipi": { "type": "string" },
47 "Motor Gücü": { "type": "string" },
48 "Motor Hacmi": { "type": "string" },
49 "Çekiş": { "type": "string" },
50 "Kapı": { "type": "string" },
51 "Renk": { "type": "string" },
52 "Garanti": { "type": "string" },
53 "Ağır Hasar Kayıtlı": { "type": "string" },
54 "Plaka / Uyruk": { "type": "string" },
55 "Kimden": { "type": "string" },
56 "Takas": { "type": "string" },
57 "Durumu": { "type": "string" }
58 },
59 "additionalProperties": true
60 },
61 "images": {
62 "type": "array",
63 "description": "Array of image URLs",
64 "items": {
65 "type": "string"
66 }
67 },
68 "attributes": {
69 "type": "object",
70 "description": "Grouped attributes by section (e.g., Güvenlik, İç Donanım, Dış Donanım, Multimedya)",
71 "additionalProperties": {
72 "type": "array",
73 "items": {
74 "type": "string"
75 }
76 }
77 },
78 "technicalSpecs": {
79 "type": "object",
80 "description": "Technical specifications grouped by section",
81 "additionalProperties": {
82 "type": "object",
83 "additionalProperties": true
84 }
85 },
86 "scrapedAt": {
87 "type": "string",
88 "format": "date-time",
89 "description": "Timestamp when the data was scraped"
90 }
91 },
92 "required": ["id", "url", "title", "scrapedAt"]
93}
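For reference, a minimal object that satisfies this schema (all values below are illustrative) and would map cleanly through `_prepareRowData()` in `src/baserow.js`:

```javascript
// Illustrative example of a listing object conforming to src/schema.json.
const exampleListing = {
    id: '1122334455',
    url: 'https://www.sahibinden.com/ilan/1122334455',
    title: '2017 Volkswagen Passat 1.6 TDI BlueMotion',
    price: 1250000,
    price_currency: 'TL',
    location: 'İstanbul / Kadıköy',
    description: 'Örnek açıklama',
    info: { Marka: 'Volkswagen', Seri: 'Passat', Model: '1.6 TDI BlueMotion', Yıl: '2017', KM: '150.000' },
    images: [],
    attributes: {},
    technicalSpecs: {},
    scrapedAt: new Date().toISOString(),
};
```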
src/utils.js
1import { randomBytes } from 'crypto';
2
3/**
4 * Generates a random user agent string for desktop browsers
5 * @returns {string} A random user agent string
6 */
7export function randomUserAgent() {
8 const browsers = [
9 // Chrome on Windows
10 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
11 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
12 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
13
14 // Chrome on macOS
15 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
16 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
17
18 // Firefox on Windows
19 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
20 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:119.0) Gecko/20100101 Firefox/119.0',
21
22 // Firefox on macOS
23 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0',
24 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:119.0) Gecko/20100101 Firefox/119.0',
25
26 // Safari on macOS
27 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
28 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15',
29
30 // Edge on Windows
31 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
32 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
33 ];
34
35 return browsers[Math.floor(Math.random() * browsers.length)];
36}
37
38/**
39 * Generates a random session ID
40 * @returns {string} A random session ID
41 */
42export function generateSessionId() {
43 return `session_${randomBytes(8).toString('hex')}`;
44}
45
46/**
47 * Adds delay between actions to simulate human behavior
48 * @param {number} min Minimum delay in milliseconds
49 * @param {number} max Maximum delay in milliseconds
50 * @returns {Promise<void>} A promise that resolves after the delay
51 */
52export function randomDelay(min = 1000, max = 5000) {
53 const delay = Math.floor(Math.random() * (max - min + 1)) + min;
54 return new Promise(resolve => setTimeout(resolve, delay));
55}
56
57/**
58 * Formats price string to numeric value
59 * @param {string} priceStr Price string (e.g., "150.000 TL")
60 * @returns {number|null} Numeric price value or null if invalid
61 */
62export function formatPrice(priceStr) {
63 if (!priceStr) return null;
64
65 // Remove all non-numeric characters except decimal point
66 const numericStr = priceStr.replace(/[^0-9,.]/g, '')
67 .replace(/\./g, '') // Remove thousands separator (.)
68 .replace(/,/g, '.'); // Replace comma with decimal point
69
70 const price = parseFloat(numericStr);
71 return isNaN(price) ? null : price;
72}
73
74/**
75 * Extracts boolean value from Yes/No string in Turkish
76 * @param {string} value String value (e.g., "Evet", "Hayır")
77 * @returns {boolean|null} Boolean value or null if invalid
78 */
79export function parseYesNo(value) {
80 if (!value) return null;
81
82 const normalized = value.toLowerCase().trim();
83 if (normalized === 'evet' || normalized === 'var') return true;
84 if (normalized === 'hayır' || normalized === 'yok') return false;
85
86 return null;
87}
88
89/**
90 * Normalizes text by removing extra whitespace
91 * @param {string} text Input text
92 * @returns {string} Normalized text
93 */
94export function normalizeText(text) {
95 if (!text) return '';
96 return text.replace(/\s+/g, ' ').trim();
97}
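A few quick examples of what these helpers return for typical Sahibinden values (worth keeping in mind because `formatPrice` assumes the Turkish thousands separator):

```javascript
import { formatPrice, parseYesNo, normalizeText } from './utils.js';

console.log(formatPrice('150.000 TL'));          // 150000
console.log(formatPrice('1.250.000 TL'));        // 1250000
console.log(parseYesNo('Hayır'));                // false
console.log(parseYesNo('Evet'));                 // true
console.log(normalizeText('  2017   Passat  ')); // '2017 Passat'
```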
.dockerignore
1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "root": true,
3 "env": {
4 "browser": true,
5 "es2020": true,
6 "node": true
7 },
8 "extends": [
9        "@apify/eslint-config"
10 ],
11    "parserOptions": {
12        "ecmaVersion": 2020,
13        "sourceType": "module"
14    },
15 "ignorePatterns": [
16 "node_modules",
17 "dist",
18 "**/*.d.ts"
19 ]
20}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5.vscode
6.zed
7dist
8node_modules
9apify_storage
10storage
BASEROW_SETUP.md
1# BaseRow Setup Guide for Sahibinden.com Scraper
2
3This guide provides detailed instructions for setting up a BaseRow table to store the car listings data scraped from Sahibinden.com.
4
5## 1. Create a BaseRow Account
6
7If you don't already have a BaseRow account:
8
91. Go to [BaseRow.io](https://baserow.io/)
102. Click "Sign Up" and follow the registration process
113. Verify your email address
12
13## 2. Create a New Database
14
151. From your BaseRow dashboard, click "Create database"
162. Name it "Sahibinden Car Listings" (or your preferred name)
173. Click "Create"
18
19## 3. Create a Table with the Required Schema
20
211. In your new database, click "Create table"
222. Name it "Car Listings"
233. Click "Create"
24
25## 4. Set Up Table Fields
26
27Configure the following fields in your table:
28
29| Field Name | Type | Description |
30|------------|------|-------------|
31| listing_id | Text | Unique identifier from Sahibinden.com |
32| url | URL | Full URL of the listing |
33| title | Text | Title of the car listing |
34| price | Number | Numeric price value |
35| price_currency | Text | Currency of the price (TL, EUR) |
36| location | Text | Location information |
37| description | Long text | Full description text |
38| make | Text | Car make/brand |
39| model | Text | Car model |
40| series | Text | Car series |
41| year | Text | Manufacturing year |
42| fuel_type | Text | Type of fuel |
43| transmission | Text | Transmission type |
44| mileage | Text | Kilometer reading |
45| body_type | Text | Body type |
46| engine_power | Text | Engine power |
47| engine_capacity | Text | Engine capacity |
48| drive_type | Text | Drive type |
49| doors | Text | Number of doors |
50| color | Text | Car color |
51| warranty | Text | Warranty information |
52| damage_record | Text | Damage record status |
53| plate_nationality | Text | Plate/nationality information |
54| seller_type | Text | Seller type (dealer, individual) |
55| trade_in | Text | Trade-in availability |
56| condition | Text | Car condition |
57| images | Long text | JSON string of image URLs |
58| attributes | Long text | JSON string of attributes |
59| technical_specs | Long text | JSON string of technical specs |
60| scraped_at | Date & Time | When the data was scraped |
61| last_updated | Date & Time | When the record was last updated |
62
63To add each field:
641. Click the "+" button at the right end of the field headers
652. Select the appropriate field type
663. Enter the field name
674. Click "Create"
68
69## 5. Create API Token
70
711. Click on your profile icon in the top-right corner
722. Select "Account settings"
733. Go to the "API tokens" tab
744. Click "Create token"
755. Name it "Sahibinden Scraper"
766. Set appropriate permissions (at minimum: "Read database", "Create table rows", "Update table rows")
777. Click "Create"
788. Copy and save the generated token securely - you'll need it for the scraper configuration
79
80## 6. Get Table and Database IDs
81
82### Get Database ID:
831. Go to your database
842. Look at the URL in your browser
853. The database ID is the number after `/database/` in the URL
86 Example: `https://baserow.io/database/12345/table/67890` → Database ID is `12345`
87
88### Get Table ID:
891. Go to your table
902. Look at the URL in your browser
913. The table ID is the number after `/table/` in the URL
92 Example: `https://baserow.io/database/12345/table/67890` → Table ID is `67890`
93
94## 7. Configure the Scraper
95
96Use the following information in your Apify Actor configuration:
97
98```json
99{
100  "baseRowApiToken": "YOUR_BASEROW_API_TOKEN",
101  "baseRowTableId": "YOUR_TABLE_ID",
102  "baseRowDatabaseId": "YOUR_DATABASE_ID"
103}
104```
105
106## 8. Test the Integration
107
1081. Run the scraper with a small `maxRequestsPerCrawl` value (e.g., 10)
1092. Check your BaseRow table to verify that data is being correctly stored
1103. Verify that all fields are populated as expected
111
112## 9. Set Up Views for AI Chatbot
113
114To make it easier for your AI chatbot to query the data, create filtered views:
115
1161. In your table, click "Create view"
1172. Select "Grid view"
1183. Name it appropriately (e.g., "Recent Listings")
1194. Add filters as needed (e.g., filter by make, model, or date range)
1205. Save the view
121
122## 10. Handling Duplicates
123
124The scraper is designed to handle duplicates by checking if a listing with the same ID already exists. If it does, the scraper will update the existing record rather than creating a new one.
125
126## 11. Data Maintenance
127
128Periodically:
1291. Check for and remove any duplicate entries
1302. Archive or delete very old listings that are no longer relevant
1313. Verify data integrity, especially for fields that power your AI chatbot's price estimation
132
133## 12. Troubleshooting
134
135If data is not appearing in BaseRow:
1361. Verify your API token has the correct permissions
1372. Check that the table and database IDs are correct
1383. Look at the Apify Actor logs for any API errors
1394. Ensure your BaseRow account has not reached any usage limits
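A quick way to confirm the token and IDs before a full run is a single request against the same endpoint the scraper uses. A sketch, assuming Node.js with `axios` installed (the environment-variable name is illustrative):

```javascript
// Sanity-check the BaseRow credentials with a one-row request.
import axios from 'axios';

const apiToken = process.env.BASEROW_API_TOKEN;                    // illustrative env var name
const tableUrl = 'https://baserow.io/database/12345/table/67890';  // paste your table URL here
const [, databaseId, tableId] = tableUrl.match(/\/database\/(\d+)\/table\/(\d+)/) ?? [];

const { data } = await axios.get(`https://api.baserow.io/api/database/rows/table/${tableId}/`, {
    headers: { Authorization: `Token ${apiToken}` },
    params: { user_field_names: true, size: 1 },
});
console.log({ databaseId, tableId, totalRows: data.count });
```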
CHATBOT_INTEGRATION.md
1# AI Chatbot Integration Guide for Sahibinden.com Scraper
2
3This guide provides instructions for integrating the scraped car listing data from BaseRow with an AI chatbot for used car price estimation.
4
5## Overview
6
7The AI chatbot will:
81. Accept user descriptions of cars (e.g., "2017 Passat 3 parça boya 150bin km")
92. Query BaseRow for comparable listings
103. Calculate an estimated price range
114. Return the estimate to the user with supporting data
12
13## Data Access
14
15### Direct BaseRow API Access
16
17The AI chatbot can access the data directly from BaseRow using their API:
18
19```python
20import requests
21import json
22
23def query_baserow(api_token, table_id, filters=None):
24 """
25 Query BaseRow table with optional filters
26
27 Args:
28 api_token (str): BaseRow API token
29 table_id (str): BaseRow table ID
30 filters (dict): Optional filters to apply
31
32 Returns:
33 list: Matching car listings
34 """
35 headers = {
36 'Authorization': f'Token {api_token}',
37 'Content-Type': 'application/json'
38 }
39
40 url = f'https://api.baserow.io/api/database/rows/table/{table_id}/'
41
42 params = {
43 'user_field_names': 'true'
44 }
45
46 if filters:
47 for key, value in filters.items():
48 params[f'filter__{key}'] = value
49
50 response = requests.get(url, headers=headers, params=params)
51
52 if response.status_code == 200:
53 return response.json()['results']
54 else:
55 raise Exception(f"Error querying BaseRow: {response.text}")
56```
57
58### Example Usage
59
60```python
61# Find similar cars based on make, model, and year
62similar_cars = query_baserow(
63 api_token='YOUR_API_TOKEN',
64 table_id='YOUR_TABLE_ID',
65 filters={
66 'make__contains': 'Volkswagen',
67 'model__contains': 'Passat',
68 'year__contains': '2017'
69 }
70)
71
72# Process results
73for car in similar_cars:
74 print(f"Title: {car['title']}")
75 print(f"Price: {car['price']} {car['price_currency']}")
76 print(f"Mileage: {car['mileage']}")
77 print(f"Location: {car['location']}")
78 print("---")
79```
80
81## Price Estimation Algorithm
82
83Here's a simple algorithm for estimating car prices:
84
85```python
86def estimate_price(user_description, api_token, table_id):
87 """
88 Estimate car price based on user description
89
90 Args:
91 user_description (str): User's description of the car
92 api_token (str): BaseRow API token
93 table_id (str): BaseRow table ID
94
95 Returns:
96 dict: Price estimation and supporting data
97 """
98 # Extract key information from user description
99 extracted_info = extract_car_info(user_description)
100
101 # Query similar cars
102 filters = {}
103 if 'make' in extracted_info:
104 filters['make__contains'] = extracted_info['make']
105 if 'model' in extracted_info:
106 filters['model__contains'] = extracted_info['model']
107 if 'year' in extracted_info:
108 filters['year__contains'] = extracted_info['year']
109
110 similar_cars = query_baserow(api_token, table_id, filters)
111
112 if not similar_cars:
113 return {
114 'status': 'no_matches',
115 'message': 'No similar cars found in our database.'
116 }
117
118 # Calculate price statistics
119 prices = [car['price'] for car in similar_cars if car['price']]
120
121 if not prices:
122 return {
123 'status': 'no_prices',
124 'message': 'Found similar cars but no valid price data.'
125 }
126
127 avg_price = sum(prices) / len(prices)
128 min_price = min(prices)
129 max_price = max(prices)
130
131 # Adjust based on mileage if available
132 if 'mileage' in extracted_info:
133 user_mileage = extract_numeric(extracted_info['mileage'])
134 if user_mileage:
135 # Calculate average mileage
136 mileages = [extract_numeric(car['mileage']) for car in similar_cars if car['mileage']]
137 mileages = [m for m in mileages if m] # Filter out None values
138
139 if mileages:
140 avg_mileage = sum(mileages) / len(mileages)
141 # Adjust price based on mileage difference
142 if avg_mileage > 0:
143 mileage_factor = 1 - ((user_mileage - avg_mileage) / avg_mileage) * 0.1
144 avg_price = avg_price * mileage_factor
145
146 # Adjust based on damage status if available
147 if 'damage' in extracted_info:
148 damage_status = extracted_info['damage']
149 if damage_status:
150 # Reduce price for damaged cars
151 avg_price = avg_price * 0.85
152
153 return {
154 'status': 'success',
155 'estimated_price': round(avg_price),
156 'price_range': {
157 'min': round(min_price),
158 'max': round(max_price)
159 },
160 'similar_cars_count': len(similar_cars),
161 'currency': similar_cars[0]['price_currency'] if similar_cars else 'TL',
162 'similar_cars': similar_cars[:5] # Return top 5 similar cars
163 }
164
165def extract_car_info(description):
166 """
167 Extract car information from user description
168 This is a simplified example - in a real implementation,
169 you would use NLP techniques or a language model
170 """
171 info = {}
172
173 # Extract make and model
174 common_makes = ['Volkswagen', 'BMW', 'Mercedes', 'Audi', 'Toyota', 'Honda', 'Ford', 'Renault']
175 for make in common_makes:
176 if make.lower() in description.lower():
177 info['make'] = make
178            # Look for common models for this make (extend as needed)
179            models = []
180            if make == 'Volkswagen':
181                models = ['Passat', 'Golf', 'Polo', 'Tiguan', 'Jetta']
182            elif make == 'BMW':
183                models = ['320', '520', 'X5', 'X3', 'M3', 'M5']
184
185 for model in models:
186 if model.lower() in description.lower():
187 info['model'] = model
188 break
189 break
190
191 # Extract year (4-digit number between 1990 and current year)
192 import re
193 year_matches = re.findall(r'\b(19[9][0-9]|20[0-2][0-9])\b', description)
194 if year_matches:
195 info['year'] = year_matches[0]
196
197 # Extract mileage (number followed by km, bin km, or similar)
198 mileage_matches = re.findall(r'(\d+)(?:\s*(?:bin|k|b|000))?(?:\s*km)', description.lower())
199 if mileage_matches:
200 mileage = int(mileage_matches[0])
201 # If the number is small, assume it's in thousands
202 if mileage < 1000:
203 mileage *= 1000
204 info['mileage'] = str(mileage)
205
206 # Extract damage information
207 damage_keywords = ['hasar', 'boya', 'değişen', 'tramer']
208 for keyword in damage_keywords:
209 if keyword in description.lower():
210 info['damage'] = True
211 break
212
213 return info
214
215def extract_numeric(text):
216 """Extract numeric value from text"""
217 if not text:
218 return None
219
220 import re
221 numbers = re.findall(r'\d+', text)
222 if numbers:
223 return int(''.join(numbers))
224 return None
225```
226
227## Integration with Popular AI Frameworks
228
229### OpenAI Integration
230
231```python
232import openai
233
234def get_price_estimate_openai(user_query, api_token, table_id):
235 # First extract structured data from the user query
236 extraction_prompt = f"""
237 Extract car information from the following user query:
238 "{user_query}"
239
240 Return a JSON object with the following fields if present:
241 - make: Car manufacturer
242 - model: Car model
243 - year: Manufacturing year
244 - mileage: Mileage in km
245 - damage: Boolean indicating if damage is mentioned
246 """
247
248 extraction_response = openai.ChatCompletion.create(
249 model="gpt-4",
250 messages=[
251 {"role": "system", "content": "You are a car information extraction assistant."},
252 {"role": "user", "content": extraction_prompt}
253 ]
254 )
255
256 extracted_info = json.loads(extraction_response.choices[0].message.content)
257
258 # Query BaseRow for similar cars
259 filters = {}
260 if 'make' in extracted_info:
261 filters['make__contains'] = extracted_info['make']
262 if 'model' in extracted_info:
263 filters['model__contains'] = extracted_info['model']
264 if 'year' in extracted_info:
265 filters['year__contains'] = extracted_info['year']
266
267 similar_cars = query_baserow(api_token, table_id, filters)
268
269 # Generate price estimate and response
270 if not similar_cars:
271 return "I couldn't find any similar cars in our database. Please provide more details about the car."
272
273 # Calculate price statistics and prepare data for the AI
274 prices = [car['price'] for car in similar_cars if car['price']]
275 avg_price = sum(prices) / len(prices) if prices else 0
276 min_price = min(prices) if prices else 0
277 max_price = max(prices) if prices else 0
278
279 # Prepare data for the AI to generate a response
280 car_data = {
281 "query": user_query,
282 "extracted_info": extracted_info,
283 "similar_cars_count": len(similar_cars),
284 "price_stats": {
285 "average": round(avg_price),
286 "minimum": round(min_price),
287 "maximum": round(max_price),
288 "currency": similar_cars[0]['price_currency'] if similar_cars else "TL"
289 },
290 "example_listings": similar_cars[:3] # First 3 similar cars
291 }
292
293 # Generate a natural language response
294 response_prompt = f"""
295 Based on the following car data, provide a price estimate and explanation:
296 {json.dumps(car_data, indent=2)}
297
298 Your response should include:
299 1. The estimated price range
300 2. Factors that influence the price
301 3. A brief explanation of how you arrived at this estimate
302 4. Any caveats or additional information the user should know
303 """
304
305 final_response = openai.ChatCompletion.create(
306 model="gpt-4",
307 messages=[
308 {"role": "system", "content": "You are a car price estimation assistant."},
309 {"role": "user", "content": response_prompt}
310 ]
311 )
312
313 return final_response.choices[0].message.content
314```
315
316## Handling Edge Cases
317
318### Insufficient Data
319
320When there aren't enough similar listings:
321
322```python
323def get_fallback_estimate(extracted_info, api_token, table_id):
324 """Get a fallback estimate when exact matches aren't available"""
325 # Try with just make
326 if 'make' in extracted_info:
327 similar_make = query_baserow(
328 api_token,
329 table_id,
330 {'make__contains': extracted_info['make']}
331 )
332
333 if similar_make:
334 return {
335 'status': 'partial_match',
336 'message': f"Found {len(similar_make)} cars of the same make, but not the exact model or year.",
337 'data': calculate_price_stats(similar_make)
338 }
339
340 # Try with just year range
341 if 'year' in extracted_info:
342 year = int(extracted_info['year'])
343 year_range = query_baserow(
344 api_token,
345 table_id,
346 {
347 'year__gte': str(year - 2),
348 'year__lte': str(year + 2)
349 }
350 )
351
352 if year_range:
353 return {
354 'status': 'year_range_match',
355 'message': f"Found {len(year_range)} cars from similar years ({year-2}-{year+2}).",
356 'data': calculate_price_stats(year_range)
357 }
358
359 return {
360 'status': 'no_data',
361 'message': "Insufficient data to provide an estimate."
362 }
363```
364
365### Handling Ambiguous Queries
366
367For ambiguous queries, prompt the user for clarification:
368
369```python
370def handle_ambiguous_query(user_query, possible_matches):
371 """Handle ambiguous car queries by asking for clarification"""
372 if len(possible_matches) > 1:
373 makes = set(car['make'] for car in possible_matches if 'make' in car)
374 models = set(car['model'] for car in possible_matches if 'model' in car)
375
376 clarification_message = "I found multiple possible matches. Could you specify which one you mean?\n\n"
377
378 if len(makes) > 1:
379 clarification_message += f"Makes: {', '.join(makes)}\n"
380
381 if len(models) > 1:
382 clarification_message += f"Models: {', '.join(models)}\n"
383
384 return {
385 'status': 'needs_clarification',
386 'message': clarification_message,
387 'options': [f"{car['make']} {car['model']} {car['year']}" for car in possible_matches[:5]]
388 }
389```
390
391## Performance Optimization
392
393For better performance with large datasets:
394
3951. Create indexes on frequently queried fields in BaseRow
3962. Cache common queries
3973. Implement pagination for large result sets (see the Node.js sketch after this list)
3984. Pre-process and aggregate data for common queries
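For item 3, the BaseRow rows endpoint returns results in pages via the `page` and `size` query parameters. A sketch of walking a large table page by page, shown in Node.js to match the scraper's language (the same parameters apply from Python):

```javascript
// Fetch all rows from a large BaseRow table in pages instead of one request.
import axios from 'axios';

export async function fetchAllRows(apiToken, tableId, pageSize = 200) {
    const rows = [];
    for (let page = 1; ; page += 1) {
        const { data } = await axios.get(
            `https://api.baserow.io/api/database/rows/table/${tableId}/`,
            {
                headers: { Authorization: `Token ${apiToken}` },
                params: { user_field_names: true, size: pageSize, page },
            },
        );
        rows.push(...data.results);
        if (!data.next) break; // last page reached
    }
    return rows;
}
```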
399
400## Conclusion
401
402This integration guide provides the foundation for connecting your AI chatbot to the scraped Sahibinden.com data in BaseRow. The actual implementation may vary depending on your specific AI platform and requirements.
403
404For best results:
4051. Regularly update the scraped data
4062. Fine-tune the price estimation algorithm based on user feedback
4073. Expand the car information extraction to handle more complex queries
4084. Consider implementing a feedback mechanism to improve estimates over time
package.json
1{
2 "name": "sahibinden-scraper-puppeteer-js",
3 "version": "1.0.0",
4 "description": "Robust web scraper for Sahibinden.com using Puppeteer + Stealth (JavaScript ESM)",
5 "main": "src/main.js",
6 "type": "module",
7 "dependencies": {
8 "apify": "^3.1.0",
9 "crawlee": "^3.13.0",
10 "puppeteer": "*",
11 "puppeteer-extra": "latest",
12 "puppeteer-extra-plugin-stealth": "latest"
13 },
14 "scripts": {
15 "start": "node src/main.js",
16 "start:prod": "node src/main.js"
17 },
18 "author": "AI Assistant / User",
19 "license": "ISC"
20}
Pricing
Pricing model: Pay per usage
This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.