1
2
3
4
5
6import { Actor } from 'apify';
7import { PlaywrightCrawler, Dataset } from 'crawlee';
8
9
/**
 * Normalizes scraped text down to printable Latin characters.
 *
 * Processing order:
 *  1. Whitespace-like characters (tab, line breaks, NBSP and other Unicode
 *     spaces) become a plain space, so the words on either side of them are
 *     not glued together when the character is removed.
 *  2. Everything outside printable ASCII, Latin-1 (U+00A1–U+00FF) and
 *     Latin Extended-A/B (U+0100–U+024F) is dropped. This covers emoji,
 *     symbols, control characters, zero-width characters and variation
 *     selectors in a single pass.
 *  3. The literal phrase "Copy open hours" (with an optional leading comma)
 *     is removed, whitespace runs are collapsed and the result is trimmed.
 *
 * @param {unknown} text - Raw text; anything that is not a non-empty string yields ''.
 * @returns {string} The cleaned, trimmed text (possibly empty).
 */
function cleanText(text) {
  if (!text || typeof text !== 'string') return '';
  return text
    // Zero-width characters (U+200B–U+200F, U+FEFF) are deliberately NOT in
    // this class: they separate nothing visible and must vanish, not become
    // a space.
    .replace(/[\t\n\v\f\r\u0085\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/g, ' ')
    // Allow-list: anything not printable Latin is deleted. Astral characters
    // are removed as two surrogate halves, which gives the same result as
    // removing the code point.
    .replace(/[^\x20-\x7E\xA1-\xFF\u0100-\u024F]/g, '')
    .replace(/,?\s*Copy open hours/gi, '')
    .replace(/\s+/g, ' ')
    .trim();
}
30
/**
 * Strips UI boilerplate from an opening-hours string.
 *
 * Removes "See more hours" / "Copy open hours" labels, a leading "Open ·"
 * status, "Closes soon ·" prefixes and "· Opens <time> <word>" suffixes,
 * then normalizes bullet separators to a single "·" and collapses whitespace.
 *
 * @param {unknown} text - Raw hours text; anything that is not a non-empty string yields ''.
 * @returns {string} The cleaned, trimmed hours text (possibly empty).
 */
function cleanHours(text) {
  if (!text || typeof text !== 'string') return '';
  return text
    .replace(/[·•]\s*See more hours?/gi, '')
    .replace(/See more hours?[·•]?/gi, '')
    .replace(/[·•,]?\s*Copy open hours?/gi, '')
    .replace(/^Open\s*[·•]\s*/i, '')
    .replace(/Closes soon\s*[·•]\s*/gi, '')
    // Accept optional minutes ("9:30") and any whitespace, including
    // no-break spaces, between the number and AM/PM. Requiring "9AM"
    // exactly left suffixes such as "Opens 9:30 AM Mon" in the output.
    .replace(/[·•]\s*Opens\s+\d+(?::\d+)?\s*[AP]M\s+\w+/gi, '')
    .replace(/[·•]+/g, '·')
    .replace(/\s+/g, ' ')
    .trim();
}
51
await Actor.init();

// Record the wall-clock start in the key-value store under START_TIME so it
// can be read back when the run summary is built.
await Actor.setValue('START_TIME', Date.now());

// Read the Actor input; every option falls back to a default when undefined.
const input = await Actor.getInput() ?? {};
const {
    searchQueries = ['coffee shops in San Francisco'],
    maxResults: rawMaxResults = 20,
    includeReviews = false,
    includeImages = true,
    language = 'en',
    maxConcurrency = 5,
} = input;

if (!Array.isArray(searchQueries) || searchQueries.length === 0) {
    throw new Error('❌ At least one search query is required. Please provide search terms like "restaurants in New York"');
}

// Coerce before validating: numeric strings keep working, while NaN and
// other non-numeric values no longer pass the range comparison.
const requestedMaxResults = Number(rawMaxResults);
if (!Number.isFinite(requestedMaxResults) || requestedMaxResults < 1 || requestedMaxResults > 500) {
    throw new Error('❌ maxResults must be between 1 and 500');
}
const maxResults = Math.trunc(requestedMaxResults);

// Keep only non-blank string queries, trimmed. The typeof guard stops a
// non-string entry from throwing a TypeError on `.trim()`.
const validQueries = searchQueries
    .filter((q) => typeof q === 'string' && q.trim().length > 0)
    .map((q) => q.trim());
if (validQueries.length === 0) {
    throw new Error('❌ All search queries are empty. Please provide valid search terms.');
}

console.log(`🚀 Starting Google Maps Business Scraper`);
console.log(`📊 Queries: ${validQueries.length} | Max results per query: ${maxResults}`);
console.log(`⚙️ Include reviews: ${includeReviews} | Include images: ${includeImages}`);

// One SEARCH request per query. Both the query and the language code are
// percent-encoded so neither can break the URL.
const startUrls = validQueries.map((query) => ({
    url: `https://www.google.com/maps/search/${encodeURIComponent(query)}?hl=${encodeURIComponent(language)}`,
    userData: { label: 'SEARCH', query },
}));

const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    useApifyProxy: true,
});
101
/**
 * Extracts a star rating from free text such as "4.5" or "4,5 stars".
 *
 * @param {string|null|undefined} text - Text read from a rating element.
 * @returns {number|null} The first number in the text if it lies in [0, 5], otherwise null.
 */
function parseRating(text) {
    if (!text) return null;
    // Drop left-to-right marks before looking for the number.
    const match = text.replace(/\u200E/g, '').match(/(\d+[.,]?\d*)/);
    if (!match) return null;
    const value = Number.parseFloat(match[1].replace(',', '.'));
    return !Number.isNaN(value) && value >= 0 && value <= 5 ? value : null;
}

/**
 * Extracts a review count from free text.
 *
 * Tries the most specific shape first so a rating decimal in front of the
 * count is not mistaken for it: "<n> review(s)", then "(<n>)", then the
 * first bare number.
 *
 * @param {string|null|undefined} text - Text read from a reviews element.
 * @returns {number|null} The parsed count, or null when the text has no digits.
 */
function parseReviewsCount(text) {
    if (!text) return null;
    const match = text.match(/(\d[\d.,\u00A0\u202F]*)\s*reviews?/i)
        || text.match(/\((\d[\d.,\u00A0\u202F]*)\)/)
        || text.match(/(\d[\d.,]*)/);
    if (!match) return null;
    const count = Number.parseInt(match[1].replace(/\D/g, ''), 10);
    return Number.isNaN(count) ? null : count;
}

/**
 * Handles a SEARCH request: waits for the results feed, scrolls it, collects
 * the unique place links and enqueues up to `maxResults` DETAIL requests.
 *
 * @param {object} context - Crawling context; `page`, `request` and `log` are used.
 * @returns {Promise<void>}
 */
async function handleSearchPage({ page, request, log }) {
    const { query } = request.userData;
    const feedSelector = 'div[role="feed"]';

    log.info(`Processing search: ${query}`);

    await page.waitForSelector(feedSelector, { timeout: 30000 });
    await page.waitForTimeout(3000);

    await autoScroll(page, feedSelector, maxResults);

    const businessLinks = await page.$$eval(
        'a[href*="/maps/place/"]',
        (links) => [...new Set(links.map((link) => link.href))],
    );

    log.info(`Found ${businessLinks.length} businesses for query: ${query}`);

    // Enqueue all detail pages in a single call instead of one call per link.
    const detailRequests = businessLinks.slice(0, maxResults).map((businessUrl) => ({
        url: businessUrl,
        userData: { label: 'DETAIL', query, businessUrl },
    }));
    if (detailRequests.length > 0) {
        await crawler.addRequests(detailRequests);
    }
}

/**
 * Handles a DETAIL request: reads the business fields from the place page and
 * pushes one record to the dataset. Pages without a usable name are skipped.
 *
 * @param {object} context - Crawling context; `page`, `request` and `log` are used.
 * @returns {Promise<void>}
 */
async function handleDetailPage({ page, request, log }) {
    const { query, businessUrl } = request.userData;

    log.info(`Extracting business details from: ${businessUrl}`);

    await page.waitForSelector('h1', { timeout: 15000 });
    await page.waitForTimeout(3000);

    // Reads from the first element matching `selector`. count() is checked
    // first so an absent element is skipped right away instead of letting the
    // locator read wait for it to appear. Failures are logged, never thrown.
    const readFirst = async (selector, read) => {
        try {
            const element = page.locator(selector).first();
            if (await element.count() === 0) return null;
            return await read(element);
        } catch (err) {
            log.debug(`Could not read "${selector}": ${err.message}`);
            return null;
        }
    };
    const readText = (selector) => readFirst(selector, (element) => element.textContent());
    const readAttr = (selector, attr) => readFirst(selector, (element) => element.getAttribute(attr));

    // Returns the trimmed text of the first selector that yields non-blank text.
    const trySelectors = async (selectors) => {
        for (const selector of selectors) {
            const text = (await readText(selector))?.trim();
            if (text) return text;
        }
        return null;
    };

    // Returns the first non-empty attribute value among the selectors.
    const tryAttr = async (selectors, attr = 'href') => {
        for (const selector of selectors) {
            const value = await readAttr(selector, attr);
            if (value) return value;
        }
        return null;
    };

    // A record without a name is never saved, so bail out before doing the
    // rest of the extraction work.
    const name = cleanText(await trySelectors(['h1', 'h1 span', 'div[role="main"] h1']));
    if (!name) {
        log.warning(`⚠️ Skipping business with no name: ${businessUrl}`);
        return;
    }

    const category = await trySelectors([
        'button[jsaction*="category"]',
        'button[aria-label*="Category"]',
        'div[role="main"] button:nth-of-type(1)',
    ]);

    let rating = null;
    const ratingSelectors = [
        'div[role="article"] span[aria-hidden="true"]',
        'span[aria-label*="stars"]',
        'span.ceNzKf',
        'div.F7nice span[aria-hidden="true"]',
        'span.MW4etd',
    ];
    for (const selector of ratingSelectors) {
        rating = parseRating(await readText(selector));
        if (rating !== null) break;
    }

    let reviewsCount = null;
    const reviewsSelectors = [
        'button[aria-label*="review"]',
        'button[aria-label*="reviews"]',
        'div.F7nice span:last-child',
        'span.UY7F9',
        'button[jsaction*="reviews"]',
    ];
    for (const selector of reviewsSelectors) {
        reviewsCount = parseReviewsCount(await readText(selector));
        if (reviewsCount !== null) break;
    }

    const address = await trySelectors(['button[data-item-id="address"]', 'button[aria-label*="Address"]', 'button[aria-label*="address"]']);
    const phone = await trySelectors(['button[data-item-id*="phone"]', 'button[aria-label*="Phone"]', 'a[href^="tel:"]']);
    const website = await tryAttr(['a[data-item-id="authority"]', 'a[aria-label*="Website"]'], 'href');

    // Coordinates are taken from the "@lat,lng" segment of the current URL.
    const coords = page.url().match(/@(-?\d+\.\d+),(-?\d+\.\d+)/);
    const latitude = coords ? Number.parseFloat(coords[1]) : null;
    const longitude = coords ? Number.parseFloat(coords[2]) : null;

    // Hours: try progressively looser sources until one yields text.
    const hours = (await readAttr('button[data-item-id*="hours"]', 'aria-label'))
        || (await readText('div[data-item-id*="hours"] .fontBodyMedium'))
        || (await readAttr('button[aria-label*="hours" i]', 'aria-label'))
        || (await readText('div[role="region"] >> text=/\\d{1,2}.*[AP]M/i'))
        || null;
    log.debug(`Hours extracted: ${hours || 'none'}`);

    let images = [];
    if (includeImages) {
        try {
            // Look at the first five matching <img> elements and keep absolute URLs.
            const sources = await page.$$eval(
                'button[aria-label*="Photo"] img, div[role="img"] img',
                (imgs) => imgs.slice(0, 5).map((img) => img.getAttribute('src')),
            );
            images = sources.filter((src) => src && src.startsWith('http'));
        } catch (err) {
            log.debug(`Image extraction error: ${err.message}`);
        }
    }

    const businessData = {
        name,
        category: cleanText(category) || undefined,
        address: cleanText(address) || undefined,
        phone: cleanText(phone) || undefined,
        website: website || undefined,
        // `??` rather than `||`: 0 is a legitimate rating/coordinate value.
        rating: rating ?? undefined,
        reviewsCount: reviewsCount ?? undefined,
        latitude: latitude ?? undefined,
        longitude: longitude ?? undefined,
        hours: cleanHours(hours) || undefined,
        images: images.length > 0 ? images : undefined,
    };

    if (includeReviews && businessData.reviewsCount > 0) {
        try {
            const reviewsButton = await page.$('button[aria-label*="Reviews"]');
            if (reviewsButton) {
                await reviewsButton.click();
                await page.waitForTimeout(2000);

                const reviews = await page.$$eval(
                    'div[data-review-id] span[lang]',
                    (elements) => elements.map((el) => el.textContent.trim()).slice(0, 10),
                );
                businessData.reviews = reviews.map((r) => cleanText(r)).filter((r) => r);
            }
        } catch (err) {
            log.warning(`Failed to extract reviews: ${err.message}`);
        }
    }

    businessData.url = businessUrl;
    businessData.searchQuery = query;
    businessData.scrapedAt = new Date().toISOString();

    await Dataset.pushData(businessData);
    log.info(`✅ Saved: ${businessData.name} | Rating: ${businessData.rating || 'N/A'} | Reviews: ${businessData.reviewsCount || 'N/A'}`);
}

const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    maxConcurrency,

    browserPoolOptions: {
        useFingerprints: true,
        fingerprintOptions: {
            fingerprintGeneratorOptions: {
                browsers: ['chrome'],
                devices: ['desktop'],
                operatingSystems: ['windows', 'macos'],
            },
        },
    },
    launchContext: {
        ...(Actor.isAtHome() && { useChrome: true }),
        // Platform-only options are merged INTO launchOptions. Spreading a
        // second `launchOptions` key over the first replaced it wholesale and
        // dropped `headless: true`.
        launchOptions: {
            headless: true,
            ...(Actor.isAtHome() && {
                devtools: false,
                args: [
                    '--disable-blink-features=AutomationControlled',
                    // NOTE(review): this turns off the browser's same-origin
                    // protections; confirm it is really required.
                    '--disable-web-security',
                ],
            }),
        },
    },

    navigationTimeoutSecs: 60,
    requestHandlerTimeoutSecs: 120,
    maxRequestRetries: 3,
    // One detail page per result per query, plus headroom for the search pages.
    maxRequestsPerCrawl: maxResults * validQueries.length + 100,

    // Dispatches on the request label; unknown labels are ignored.
    async requestHandler(context) {
        const { label } = context.request.userData;

        if (label === 'SEARCH') {
            await handleSearchPage(context);
        } else if (label === 'DETAIL') {
            await handleDetailPage(context);
        }
    },

    // Called for a request that ultimately failed: logs it and stores an
    // error record (flagged with `error: true`) alongside the results.
    async failedRequestHandler({ request, log }) {
        log.error(`❌ Failed request: ${request.url} - Error: ${request.errorMessages?.join(', ') || 'Unknown error'}`);

        await Actor.pushData({
            error: true,
            url: request.url,
            query: request.userData.query,
            errorMessage: request.errorMessages?.join(', ') || 'Request failed',
            timestamp: new Date().toISOString(),
        });
    },
});
381
382
/**
 * Scrolls a results container to its bottom repeatedly so more items load.
 *
 * Stops as soon as one of these holds:
 *  - the container does not exist,
 *  - at least `maxItems` unique item links are already present,
 *  - the container's scroll height did not change after a scroll,
 *  - `Math.ceil(maxItems / 20)` scrolls have been performed.
 *
 * @param {object} page - Playwright page (uses `evaluate` and `waitForTimeout`).
 * @param {string} containerSelector - CSS selector of the scrollable container.
 * @param {number} maxItems - Number of items the caller wants loaded.
 * @param {string} [itemSelector] - CSS selector of the item links counted for the early exit.
 * @returns {Promise<void>}
 */
async function autoScroll(page, containerSelector, maxItems, itemSelector = 'a[href*="/maps/place/"]') {
    // Single in-page probe: container height plus the number of unique item
    // links. Returns null when the container is missing.
    const measure = () => page.evaluate(({ container, item }) => {
        const element = document.querySelector(container);
        if (!element) return null;
        const hrefs = Array.from(document.querySelectorAll(item), (link) => link.href);
        return { height: element.scrollHeight, loaded: new Set(hrefs).size };
    }, { container: containerSelector, item: itemSelector });

    let state = await measure();
    if (!state) return;

    const maxScrollAttempts = Math.ceil(maxItems / 20);
    let previousHeight = 0;

    // The `loaded` check skips the scroll and its 2 s wait when enough
    // results are already on the page.
    for (let attempt = 0; attempt < maxScrollAttempts && state.loaded < maxItems; attempt++) {
        await page.evaluate((sel) => {
            const element = document.querySelector(sel);
            if (element) {
                element.scrollTo(0, element.scrollHeight);
            }
        }, containerSelector);

        // Give the page time to append the next batch before measuring again.
        await page.waitForTimeout(2000);

        state = await measure();
        if (!state || state.height === previousHeight) {
            break;
        }
        previousHeight = state.height;
    }
}
416
417
await crawler.run(startUrls);

// Read everything back from the default dataset and split business records
// from the error records written by the failed-request handler.
const dataset = await Dataset.open();
const { items } = await dataset.getData();
const successfulItems = items.filter((item) => !item.error);
const failedItems = items.filter((item) => item.error);
const itemCount = successfulItems.length;

// Formats part/total as a percentage with one decimal. An empty total yields
// "0.0" instead of "NaN".
const toPercent = (part, total) => (total > 0 ? (part / total) * 100 : 0).toFixed(1);

const businessesWithPhone = successfulItems.filter((b) => b.phone).length;
const businessesWithWebsite = successfulItems.filter((b) => b.website).length;
const businessesWithReviews = successfulItems.filter((b) => b.reviewsCount && b.reviewsCount > 0).length;
const ratedItems = successfulItems.filter((b) => b.rating);
const avgRating = ratedItems.length > 0
    ? ratedItems.reduce((sum, b) => sum + b.rating, 0) / ratedItems.length
    : 0;

console.log(`\n${'='.repeat(70)}`);
console.log(`✅ SCRAPING COMPLETED SUCCESSFULLY!`);
console.log(`${'='.repeat(70)}`);
console.log(`📊 RESULTS SUMMARY:`);
console.log(` ├─ Total businesses extracted: ${itemCount}`);
console.log(` ├─ Businesses with phone: ${businessesWithPhone} (${toPercent(businessesWithPhone, itemCount)}%)`);
console.log(` ├─ Businesses with website: ${businessesWithWebsite} (${toPercent(businessesWithWebsite, itemCount)}%)`);
console.log(` ├─ Businesses with reviews: ${businessesWithReviews} (${toPercent(businessesWithReviews, itemCount)}%)`);
console.log(` ├─ Average rating: ${avgRating.toFixed(2)} ⭐`);
console.log(` └─ Failed requests: ${failedItems.length}`);
console.log(`\n🔍 QUERIES PROCESSED: ${validQueries.length}`);
console.log(`💾 DATA EXPORT: Available in JSON, CSV, Excel formats`);
console.log(`${'='.repeat(70)}\n`);

// START_TIME holds the epoch-millisecond start timestamp. Use it directly:
// subtracting it from "now" yields a duration, which rendered as a date
// lands near 1970. Fall back to the finish time if it is missing or unusable.
const finishedAt = Date.now();
const storedStart = await Actor.getValue('START_TIME');
const startedAt = Number.isFinite(storedStart) ? storedStart : finishedAt;

const runStats = {
    success: true,
    summary: {
        totalBusinesses: itemCount,
        failedRequests: failedItems.length,
        successRate: `${toPercent(itemCount, itemCount + failedItems.length)}%`,
    },
    dataQuality: {
        withPhone: businessesWithPhone,
        withWebsite: businessesWithWebsite,
        withReviews: businessesWithReviews,
        phonePercentage: `${toPercent(businessesWithPhone, itemCount)}%`,
        websitePercentage: `${toPercent(businessesWithWebsite, itemCount)}%`,
        averageRating: parseFloat(avgRating.toFixed(2)),
    },
    searchQueries: validQueries,
    configuration: {
        maxResults,
        includeReviews,
        includeImages,
        language,
        maxConcurrency,
    },
    timestamp: new Date(finishedAt).toISOString(),
    runtime: {
        startTime: new Date(startedAt).toISOString(),
        endTime: new Date(finishedAt).toISOString(),
        durationMs: finishedAt - startedAt,
    },
};

await Actor.setValue('OUTPUT', runStats);

await Actor.setStatusMessage(`✅ Extracted ${itemCount} businesses | Success rate: ${runStats.summary.successRate}`, {
    isStatusMessageTerminal: true,
});

await Actor.exit();