import asyncio
import re
import json
from urllib.parse import quote_plus

from apify import Actor
from playwright.async_api import async_playwright

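# ---------------------------------------------------------------------------
# Text-parsing helpers: Google Maps result cards and detail panels are read
# as raw inner text, so the helpers below operate on lists of stripped text
# lines rather than on structured DOM nodes.
# ---------------------------------------------------------------------------
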
def is_valid_business_name(name):
    if not name or len(name.strip()) < 1: return False
    if name.strip().startswith('"') and name.strip().endswith('"'): return False
    name_lower = name.strip().lower()
    if len(name_lower) > 80: return False
    # Pure numbers (ranks, review counts) are not names.
    if name.strip().isdigit(): return False
    # Lone separator characters.
    if name_lower in ['x', '+', '-', '·', '|', '/', '\\', '^', '*']: return False
    if name_lower.startswith(('·', '$', '#', '@', 'http', 'sponsored')): return False
    # Street addresses such as "123 Main St".
    if re.match(r'^\d+\s+\w+\s+(st|street|ave|rd|dr|blvd)', name_lower): return False
    # Bare list numbers such as "1." or "12".
    if re.match(r'^\d+\.?$', name.strip()): return False
    # Opening-hours fragments, e.g. "Open ⋅ Closes 9 PM".
    if re.match(r'^(open|closes?)\s*[⋅·]\s*(closes?|opens?)\s*\d+', name_lower): return False
    skip_patterns = ['google maps', 'search results', 'filter by', 'sort by', 'reviews', 'directions', 'open now', 'closed']
    if any(pattern in name_lower for pattern in skip_patterns): return False
    return True

def clean_business_name(name):
    if not name: return ""
    cleaned = name.strip().rstrip('&|\\').strip()
    cleanup_patterns = [
        r'\s*·.*$', r'^\s*\d+\.\s*', r'\s*\(\d+\)\s*$', r'\s*\d+\s*reviews?$',
        r'\s*\$+.*$', r'\s*Open\s*(24\s*hours?|now).*$', r'\s*Closed.*$'
    ]
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()

def extract_rating(lines):
    """Find the first plausible star rating (1.0-5.0) in the text lines."""
    for line in lines:
        match = re.search(r'\b(\d\.\d)\b', line)
        if match:
            try:
                rating = float(match.group(1))
                if 1.0 <= rating <= 5.0:
                    return rating
            except ValueError:
                continue
    return None

def extract_review_count(lines):
    """Find a parenthesised review count such as "(132)"."""
    for line in lines:
        match = re.search(r'\((\d{1,5})\)', line)
        if match:
            try: return int(match.group(1))
            except ValueError: continue
    return None

def extract_category(lines):
    """Extract the Google-provided business category"""
    category_found = None

    for i, line in enumerate(lines):
        line = line.strip()

        # Skip empty lines, ratings ("4.5"), and review counts ("(132)").
        if not line or line.isdigit() or re.match(r'^\d\.\d$', line) or re.match(r'^\(\d+\)$', line):
            continue

        # Skip status and ad labels.
        if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now', 'temporarily closed']:
            continue

        # Skip street addresses.
        if re.match(r'^\d+\s+\w+\s+(st|street|ave|avenue|rd|road|dr|drive)', line, re.IGNORECASE):
            continue

        # The category usually sits in a "Category · Address" line.
        if '·' in line:
            parts = [p.strip() for p in line.split('·')]
            for part in parts:
                # Skip price markers such as "$$".
                if re.match(r'^\$+$', part):
                    continue
                # Skip address fragments.
                if re.search(r'\d+.*?(st|street|ave|avenue|rd|road|dr|drive)', part, re.IGNORECASE):
                    continue
                # Skip ratings.
                if re.match(r'^\d\.\d$', part):
                    continue
                if not part or len(part) < 3:
                    continue

                # Categories are mostly lowercase; too many capitals suggests
                # a proper name rather than a category.
                capital_count = sum(1 for c in part if c.isupper())
                if capital_count <= len(part.split()) + 1:
                    if any(indicator in part.lower() for indicator in
                           ['restaurant', 'cafe', 'bar', 'shop', 'store', 'salon', 'spa',
                            'clinic', 'center', 'service', 'company', 'agency', 'studio',
                            'gym', 'fitness', 'market', 'pharmacy', 'hotel', 'motel',
                            'bakery', 'deli', 'pizza', 'sushi', 'thai', 'chinese', 'mexican',
                            'dental', 'medical', 'law', 'repair', 'auto', 'car']):
                        category_found = part
                        break
            # Stop at the first category found so later lines cannot overwrite it.
            if category_found:
                break

        # Fall back to a short early line that mentions a category keyword.
        if not category_found and i < 5:
            if len(line) < 50:
                if any(indicator in line.lower() for indicator in
                       ['restaurant', 'cafe', 'bar', 'shop', 'store', 'salon', 'spa',
                        'clinic', 'center', 'service', 'company', 'agency', 'studio',
                        'gym', 'fitness', 'market', 'pharmacy', 'hotel', 'motel']):
                    # Avoid Title-Case multi-word lines, which look like names.
                    if not re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]+){2,}', line):
                        category_found = line
                        break

    return category_found if category_found else "Business"

def extract_address(lines):
    """Extract address from text lines"""
    address_parts = []
    found_address_start = False

    for i, line in enumerate(lines):
        if len(line) < 5:
            continue

        # Skip status and ad labels.
        if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now', 'temporarily closed', 'opens soon', 'onsite services']:
            continue

        # Skip opening-hours lines such as "Opens 9 AM".
        if re.match(r'^(opens?|closes?)\s+', line, re.IGNORECASE):
            continue

        # Skip bare numbers, ratings, and review counts.
        if line.strip().isdigit():
            continue
        if re.match(r'^[\d.]+$', line) or re.match(r'^\(\d+\)$', line):
            continue

        # A "Category · 123 Main St" line: take everything from the first
        # address-looking part onward.
        if '·' in line and not found_address_start:
            parts = line.split('·')
            for j, part in enumerate(parts):
                part = part.strip()
                if re.search(r'\d+.*?(?:st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|suite|ste|unit|apt|#)', part, re.IGNORECASE):
                    found_address_start = True
                    address_parts.extend([p.strip() for p in parts[j:] if p.strip()])
                    break
            if found_address_start:
                continue

        # Standalone address lines: street pattern, "ST 12345", or a PO box.
        has_street_pattern = re.search(r'\d+.*?(?:st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|suite|ste|unit|apt|#)', line, re.IGNORECASE)
        has_state_zip = re.search(r'\b[A-Z]{2}\s+\d{5}\b', line)
        has_po_box = re.search(r'(p\.?o\.?\s*box|pmb)\s*\d+', line, re.IGNORECASE)

        if has_street_pattern or has_state_zip or has_po_box:
            found_address_start = True
            cleaned_line = line.split('·')[-1].strip()
            cleaned_line = re.sub(r'^(Open|Closed|Hours|Directions).*?·\s*', '', cleaned_line, flags=re.IGNORECASE)
            cleaned_line = re.sub(r',?\s*Onsite services\s*$', '', cleaned_line, flags=re.IGNORECASE)
            address_parts.append(cleaned_line)
        elif found_address_start and i < len(lines) - 1:
            # Continuation lines after the address has started.
            if re.search(r'(closes?|opens?|hours|open now|closed now|temporarily)', line, re.IGNORECASE):
                break
            if any(word in line.lower() for word in ['restaurant', 'cafe', 'shop', 'salon', 'clinic', 'service']):
                if len(line) < 30:
                    continue
            if re.search(r'(reviews?|rating|website|call|directions)', line, re.IGNORECASE):
                continue
            if 'onsite services' in line.lower():
                continue
            # Keep likely city/state/ZIP continuations; stop otherwise.
            if re.search(r'\b[A-Z]{2}\b', line) or re.search(r'\b\d{5}\b', line) or (len(line) < 50 and not line[0].isdigit()):
                address_parts.append(line.strip())
            else:
                break

    if address_parts:
        # Join the collected fragments with commas.
        full_address = address_parts[0]
        for part in address_parts[1:]:
            if not full_address.endswith(',') and not part.startswith(','):
                full_address += ', ' + part
            else:
                full_address += ' ' + part
        return full_address.strip()

    return ""

def extract_place_ids(html_content: str) -> dict:
    """Pull Google identifiers out of the raw page HTML."""
    ids = {'place_id': None, 'cid': None, 'website': None}
    place_id_match = re.search(r'(ChIJ[A-Za-z0-9_-]+)', html_content)
    if place_id_match:
        ids['place_id'] = place_id_match.group(1)
    # Hex feature ID (e.g. "0x89c25a31:0x4b1f"); stored in the 'cid' slot.
    cid_match = re.search(r'(0x[a-f0-9:]+)', html_content)
    if cid_match:
        ids['cid'] = cid_match.group(1)
    return ids

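# Illustrative example (hypothetical identifier values):
#   extract_place_ids('... ChIJN1t_tDeuEmsR ... 0x89c25a31:0x4b1f ...')
#   -> {'place_id': 'ChIJN1t_tDeuEmsR', 'cid': '0x89c25a31:0x4b1f', 'website': None}
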
def extract_price_range(lines):
    """Extract price range indicators like $, $$, $$$"""
    for line in lines:
        if re.match(r'^\$+$', line.strip()):
            return line.strip()
        if '·' in line and re.search(r'\$+', line):
            parts = line.split('·')
            for part in parts:
                if re.match(r'^\s*\$+\s*$', part):
                    return part.strip()
    return ""

def extract_website_from_page(lines):
    """Extract website URL from detail panel"""
    for line in lines:
        # Skip very long lines; they are descriptions, not URLs.
        if len(line) > 200:
            continue

        # Full URLs, excluding Google's own domains.
        if re.match(r'^https?://', line):
            if any(domain in line for domain in ['google.com', 'maps.google', 'gstatic.com', 'googleapis.com']):
                continue
            return line.strip()

        # Bare domains such as "example.com" or "www.example.com".
        domain_match = re.match(r'^(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)$', line)
        if domain_match:
            return f"https://{line.strip()}"

    return ""

def extract_booking_url(lines):
    """Extract booking/appointment URL"""
    booking_keywords = ['book', 'appointment', 'schedule', 'reserve', 'booking']

    for i, line in enumerate(lines):
        line_lower = line.lower()
        if any(keyword in line_lower for keyword in booking_keywords):
            for j in range(i, min(i + 3, len(lines))):
                if re.match(r'^https?://', lines[j]):
                    return lines[j].strip()

    return ""

def extract_services_offered(lines):
    """Extract services from business listing"""
    services = []
    services_section = False

    for line in lines:
        line_lower = line.lower()

        if 'services' in line_lower or 'offerings' in line_lower:
            services_section = True
            continue

        if services_section:
            if any(keyword in line_lower for keyword in ['hours', 'reviews', 'photos', 'about']):
                break

            if line.strip() and len(line) > 3 and not line.isdigit():
                services.append(line.strip())

            if len(services) > 10:
                break

    return services

def extract_attributes(lines):
    """Extract business attributes/amenities"""
    attributes = []
    attribute_keywords = ['wheelchair', 'parking', 'delivery', 'takeout', 'dine-in',
                          'wifi', 'accepts', 'outdoor', 'seating', 'drive-through',
                          'curbside', 'restroom', 'masks required', 'appointment']

    for line in lines[:100]:
        line_lower = line.lower()
        for keyword in attribute_keywords:
            if keyword in line_lower and len(line) < 100:
                attributes.append(line.strip())
                break

    return list(set(attributes))

def extract_highlights(lines):
    """Extract business highlights"""
    highlights = []
    highlight_keywords = ['popular for', 'known for', 'best', 'famous', 'specialty']

    for line in lines:
        line_lower = line.lower()
        if any(keyword in line_lower for keyword in highlight_keywords):
            highlights.append(line.strip())
        if len(highlights) >= 3:
            break

    return highlights

def extract_photo_count(lines):
    """Extract number of photos"""
    for line in lines:
        photo_match = re.search(r'(\d+)\s*photos?', line, re.IGNORECASE)
        if photo_match:
            try:
                return int(photo_match.group(1))
            except ValueError:
                pass
    return 0

def extract_hours(lines):
    """Extract business hours keyed by weekday name."""
    hours = {}
    days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

    for i, line in enumerate(lines):
        line_lower = line.lower()

        for day in days:
            if day in line_lower:
                # Time ranges such as "9 AM – 5 PM" or "9:30 a.m.-5:00 p.m.".
                time_pattern = r'\d{1,2}(?::\d{2})?\s*[ap]\.?m\.?\s*[–-]\s*\d{1,2}(?::\d{2})?\s*[ap]\.?m\.?'

                time_match = re.search(time_pattern, line, re.IGNORECASE)
                if time_match:
                    hours[day.capitalize()] = time_match.group()
                elif i + 1 < len(lines):
                    # The time range sometimes falls on the following line.
                    time_match = re.search(time_pattern, lines[i + 1], re.IGNORECASE)
                    if time_match:
                        hours[day.capitalize()] = time_match.group()

                if 'closed' in line_lower:
                    hours[day.capitalize()] = 'Closed'

                break

    return hours

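# Illustrative example: the day and the time range may arrive on one line or
# split across two, so both ['Monday 9 AM – 5 PM'] and ['Monday', '9 AM – 5 PM']
# yield {'Monday': '9 AM – 5 PM'}.
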
def parse_address_components(address):
    """Extract city and state from address string"""
    if not address:
        return {'city': '', 'state': ''}

    components = {'city': '', 'state': ''}
    original_address = address

    # Drop Google's "Onsite services" label before parsing.
    address = re.sub(r',?\s*Onsite services\s*,?', '', address, flags=re.IGNORECASE)

    # Known cities used to disambiguate common service areas.
    known_wa_cities = ['Seattle', 'Bellevue', 'Kirkland', 'Redmond', 'Renton', 'Newcastle',
                       'Issaquah', 'Sammamish', 'Bothell', 'Kenmore', 'Woodinville', 'Shoreline',
                       'Lake Forest Park', 'Mercer Island', 'Tukwila', 'Burien', 'SeaTac']
    known_va_cities = ['Ashburn', 'Sterling', 'Leesburg', 'Herndon', 'Reston', 'Chantilly', 'Fairfax']
    known_ca_cities = ['Los Angeles', 'Santa Monica', 'Beverly Hills', 'Pasadena', 'Whittier',
                       'Encino', 'Burbank', 'Glendale', 'Hollywood', 'Sherman Oaks']

    all_known_cities = known_wa_cities + known_va_cities + known_ca_cities

    # Two-letter US state/territory abbreviations.
    state_pattern = r'\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC)\b'

    state_matches = []
    for match in re.finditer(state_pattern, address, re.IGNORECASE):
        # "NE" after a street type is a directional ("148th Ave NE"), not Nebraska.
        if match.group(1).upper() == 'NE':
            before_text = address[:match.start()].strip()
            if re.search(r'(Ave|Avenue|St|Street|Rd|Road|Dr|Drive|Way|Pl|Place|Blvd|Boulevard)\s*$', before_text, re.IGNORECASE):
                continue
        state_matches.append(match)

    if state_matches:
        # The state abbreviation is normally the last match in the address.
        state_match = state_matches[-1]
        components['state'] = state_match.group(1).upper()
        address = address[:state_match.start()] + address[state_match.end():]
        address = address.strip().rstrip(',').strip()

    # First, try the known-city lists.
    city_found = False
    for known_city in all_known_cities:
        if known_city.lower() in original_address.lower():
            components['city'] = known_city
            city_found = True
            break

    if not city_found:
        # Numbered-grid streets with a directional ("123 4th Ave NE") are
        # common in the Seattle area; guess the city from context.
        if re.search(r'\b\d+\s+\d+\w*\s+(?:Ave|Avenue|St|Street|Way|Pl|Place)\s+(?:NE|NW|SE|SW)\b', original_address, re.IGNORECASE):
            if components['state'] == 'WA' or 'WA' in original_address:
                components['city'] = 'Seattle'
            elif re.search(r'(Kirkland|Bellevue|Redmond)', original_address, re.IGNORECASE):
                for city in ['Kirkland', 'Bellevue', 'Redmond']:
                    if city.lower() in original_address.lower():
                        components['city'] = city
                        break

    if not components['city']:
        # Fall back to scanning comma-separated parts from the end, skipping
        # anything that looks like a street, unit designator, or ZIP.
        parts = original_address.split(',')
        for part in reversed(parts):
            part = part.strip()

            if not part or len(part) < 3:
                continue

            if part.lower() in ['onsite services', 'online', 'virtual']:
                continue

            # Street numbers.
            if re.match(r'^\d+\s+', part):
                continue

            # Unit/suite designators.
            if re.search(r'\b(suite|ste|unit|apt|building|bldg|floor|fl|#)\s*[\w\d-]*\b', part, re.IGNORECASE):
                continue

            # Street-type endings.
            if re.search(r'\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|plaza|plz|cir|circle|ter|terrace|commons|cmns)\s*(?:#?\d+)?$', part, re.IGNORECASE):
                continue

            # "ST 12345" state+ZIP parts.
            if re.match(r'^[A-Z]{2}\s+\d{5}(?:-\d{4})?$', part):
                continue

            if part and not re.match(r'^\d', part):
                components['city'] = part
                break

    # Last-resort heuristics when a state was found but no city.
    if components['state'] and not components['city']:
        if components['state'] == 'WA':
            if 'Green Lake' in original_address or 'Greenlake' in original_address:
                components['city'] = 'Seattle'
            elif any(indicator in original_address for indicator in ['120th', '122nd', '116th', '130th', '148th', '164th']):
                if 'NE' in original_address:
                    components['city'] = 'Bellevue'
        elif components['state'] == 'VA' and any(city in original_address for city in known_va_cities):
            for city in known_va_cities:
                if city.lower() in original_address.lower():
                    components['city'] = city
                    break

    return components

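# Illustrative example (hypothetical address):
#   parse_address_components('123 Main St, Bellevue, WA 98004')
#   -> {'city': 'Bellevue', 'state': 'WA'}
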
def parse_location_type(location_str):
    """Determine if the location is an address, Place ID, or coordinates"""
    location_str = location_str.strip()

    # Google Place IDs start with "ChIJ".
    if location_str.startswith('ChIJ'):
        return 'place_id', location_str

    # "lat,lng" coordinate pairs (spaces tolerated).
    normalized = location_str.replace(' ', '')
    coord_pattern = r'^-?\d+\.?\d*,-?\d+\.?\d*$'
    if re.match(coord_pattern, normalized):
        parts = normalized.split(',')
        try:
            lat = float(parts[0])
            lng = float(parts[1])
            return 'coordinates', (lat, lng)
        except ValueError:
            pass

    # Everything else is treated as a free-form address.
    return 'address', location_str
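
# Illustrative examples (hypothetical values):
#   parse_location_type('ChIJAbCd...')        -> ('place_id', 'ChIJAbCd...')
#   parse_location_type('47.6097, -122.3331') -> ('coordinates', (47.6097, -122.3331))
#   parse_location_type('Bellevue, WA 98004') -> ('address', 'Bellevue, WA 98004')
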
async def geocode_with_address(page, address: str) -> dict:
    Actor.log.info(f"Geocoding address: {address}...")
    query = quote_plus(address)
    maps_url = f"https://www.google.com/maps/search/{query}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match:
        return {'found': False, 'error': 'Could not find coordinates from address'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, **ids}

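# Both geocoders read coordinates back from the URL Google Maps redirects to.
# Illustrative example: a final URL such as
#   https://www.google.com/maps/search/coffee/@47.6097,-122.3331,15z
# yields latitude 47.6097 and longitude -122.3331 via the '@lat,lng' regex.
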
async def geocode_with_place_id(page, place_id: str) -> dict:
    Actor.log.info(f"Geocoding Place ID: {place_id}...")
    # The Maps URL API requires a "query" value; a placeholder is fine because
    # "query_place_id" takes precedence when both are present.
    maps_url = f"https://www.google.com/maps/search/?api=1&query=some_text&query_place_id={place_id}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match:
        return {'found': False, 'error': 'Could not find coordinates from Place ID'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    # Put the known place_id last so a None from extract_place_ids cannot overwrite it.
    return {'found': True, 'latitude': lat, 'longitude': lng, **ids, 'place_id': place_id}

async def scrape_competitors_two_pass(playwright_instance, source_company_name: str, lat: float, lon: float,
                                      search_query: str, max_competitors: int, search_location: str = None,
                                      verbose: bool = False):
    """Two-pass scraping approach: Pass 1 at 25% zoom, Pass 2 at 50% zoom for details"""

    Actor.log.info(f"Starting two-pass scraping at {lat},{lon} for '{search_query}'...")
    query = quote_plus(search_query)
    zoom_level = 11
    url = f"https://www.google.com/maps/search/{query}/@{lat},{lon},{zoom_level}z"

    # ----- PASS 1: discover competitors -----
    Actor.log.info("PASS 1: Discovering competitors at 25% zoom...")

    # Headful browser: requires a display (e.g. Xvfb when run on the Apify platform).
    browser1 = await playwright_instance.chromium.launch(headless=False)
    context1 = await browser1.new_context(
        viewport={'width': 2560, 'height': 1440},
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )
    page1 = await context1.new_page()

    competitors = []

    try:
        await page1.goto(url, wait_until='domcontentloaded', timeout=45000)

        # Shrink the page so more result cards render at once.
        await page1.evaluate('document.body.style.zoom = "0.25"')
        await asyncio.sleep(3)

        # Nudge the page to trigger lazy loading of results.
        Actor.log.info("Triggering content loading...")
        await page1.mouse.move(960, 540)
        await page1.mouse.wheel(0, 500)
        await asyncio.sleep(2)
        await page1.mouse.wheel(0, -500)
        await asyncio.sleep(3)

        # Try several selectors; Google rotates its class names.
        selectors = ['[role="article"]', '.Nv2PK.THOPZb.CpccDe', '.lI9IFe', '.hfpxzc']
        elements = []
        for selector in selectors:
            elements = await page1.query_selector_all(selector)
            if elements and len(elements) > 5:
                Actor.log.info(f"Found {len(elements)} results with selector: {selector}")
                break

        if not elements:
            Actor.log.error("No elements found in Pass 1")
            return []

        await asyncio.sleep(5)

        Actor.log.info(f"Extracting basic info from {len(elements)} elements...")
        cleaned_source_name = re.sub(r'[^a-z0-9]', '', source_company_name.lower()) if source_company_name else ""

        for element in elements[:max_competitors + 5]:
            if len(competitors) >= max_competitors:
                break

            try:
                text = await element.inner_text()
                if not text or len(text.strip()) < 5:
                    continue

                lines = [line.strip() for line in text.split('\n') if line.strip()]

                # The business name is usually the first valid line.
                business_name = None
                for line in lines[:10]:
                    if line.strip().isdigit() or re.match(r'^\d+\.$', line):
                        continue
                    if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now']:
                        continue
                    if re.match(r'^\d\.\d$', line) or re.match(r'^\(\d+\)$', line):
                        continue
                    if is_valid_business_name(line):
                        business_name = clean_business_name(line)
                        if business_name:
                            break

                if not business_name:
                    continue

                # Skip duplicates.
                if any(c['business_name'] == business_name for c in competitors):
                    continue

                # Flag the source business itself (fuzzy match on normalized names).
                is_source = False
                if cleaned_source_name:
                    cleaned_competitor_name = re.sub(r'[^a-z0-9]', '', business_name.lower())
                    is_source = cleaned_source_name in cleaned_competitor_name or cleaned_competitor_name in cleaned_source_name

                rating = extract_rating(lines)
                review_count = extract_review_count(lines)
                address = extract_address(lines)
                category = extract_category(lines)
                price_range = extract_price_range(lines)

                competitor = {
                    'business_name': business_name,
                    'google_rank': len(competitors) + 1,
                    'is_source_business': is_source,
                    'average_rating': rating,
                    'number_of_reviews': review_count,
                    'main_category': category,
                    'address': address,
                    'city': '',
                    'state': '',
                    'website': '',
                    'phone': '',
                    'place_id': '',
                    'price_range': price_range,
                    'services_offered': [],
                    'attributes': [],
                    'highlights': [],
                    'photo_count': 0,
                    'hours': {},
                    'booking_url': ''
                }

                competitors.append(competitor)
                Actor.log.info(f"#{len(competitors)}: {business_name} - {category}")

            except Exception:
                continue

        Actor.log.info(f"PASS 1 COMPLETE: Found {len(competitors)} competitors")

    except Exception as e:
        Actor.log.error(f"Error in Pass 1: {e}")

    finally:
        Actor.log.info("Closing Pass 1 browser...")
        await browser1.close()

    if not competitors:
        return []

    # ----- PASS 2: click each result for details -----
    Actor.log.info("PASS 2: Getting detailed information at 50% zoom...")

    browser2 = await playwright_instance.chromium.launch(headless=False)
    context2 = await browser2.new_context(
        viewport={'width': 1920, 'height': 1080},
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )
    page2 = await context2.new_page()

    try:
        await page2.goto(url, wait_until='domcontentloaded', timeout=45000)

        await page2.evaluate('document.body.style.zoom = "0.5"')
        await asyncio.sleep(5)

        await page2.mouse.move(500, 400)
        await page2.mouse.wheel(0, 300)
        await asyncio.sleep(2)
        await page2.mouse.wheel(0, -300)
        await asyncio.sleep(3)

        Actor.log.info("Starting to click on businesses for details...")

        successful_extractions = 0
        last_clicked_name = None

        for idx, competitor in enumerate(competitors[:max_competitors]):
            try:
                Actor.log.info(f"Getting details for #{idx + 1}: {competitor['business_name']}")

                # Close any open detail panel before clicking the next result.
                try:
                    await page2.keyboard.press('Escape')
                    await asyncio.sleep(0.5)
                except Exception:
                    pass

                elements = await page2.query_selector_all('[role="article"]')
                if not elements:
                    elements = await page2.query_selector_all('.Nv2PK.THOPZb.CpccDe')

                clicked = False
                for element in elements:
                    try:
                        is_visible = await element.is_visible()
                        if not is_visible:
                            continue

                        text = await element.inner_text()

                        # Do not re-click the result we just processed.
                        if last_clicked_name and last_clicked_name.lower() in text.lower():
                            continue

                        # Fuzzy match: at least 60% of the name's words must appear.
                        competitor_name_parts = [part for part in competitor['business_name'].lower().split() if len(part) > 2]
                        element_text_lower = text.lower()
                        matches = sum(1 for part in competitor_name_parts if part in element_text_lower)

                        if matches >= max(1, len(competitor_name_parts) * 0.6):
                            last_clicked_name = competitor['business_name']
                            await element.click()
                            clicked = True
                            await asyncio.sleep(1.5)

                            # Locate the detail panel; fall back to the whole page.
                            detail_panel = None
                            try:
                                panel_selectors = [
                                    '[role="main"]',
                                    '.m6QErb.DxyBCb',
                                    '.xaLxOe',
                                    'div[jslog*="panel"]'
                                ]
                                for selector in panel_selectors:
                                    panels = await page2.query_selector_all(selector)
                                    if panels:
                                        # The most recently opened panel is last.
                                        detail_panel = panels[-1]
                                        break
                            except Exception:
                                pass

                            if detail_panel:
                                panel_text = await detail_panel.inner_text()
                                lines = panel_text.split('\n')
                            else:
                                full_page_text = await page2.inner_text('body')
                                lines = full_page_text.split('\n')

                            website = extract_website_from_page(lines)
                            if website:
                                # The same website twice in a row usually means
                                # the click landed on the previous panel.
                                if idx > 0 and competitors[idx - 1].get('website') == website:
                                    Actor.log.info(f"  Detected duplicate website {website}, skipping")
                                else:
                                    competitor['website'] = website
                                    successful_extractions += 1
                                    Actor.log.info(f"  Website: {website}")

                            booking_url = extract_booking_url(lines)
                            if booking_url and booking_url != competitor.get('website'):
                                competitor['booking_url'] = booking_url
                                Actor.log.info(f"  Booking URL: {booking_url}")

                            # US-style phone numbers.
                            for line in lines[:50]:
                                phone_match = re.search(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', line)
                                if phone_match:
                                    competitor['phone'] = phone_match.group()
                                    Actor.log.info(f"  Phone: {competitor['phone']}")
                                    break

                            services = extract_services_offered(lines)
                            if services:
                                competitor['services_offered'] = services
                                Actor.log.info(f"  Services found: {len(services)}")

                            attributes = extract_attributes(lines)
                            if attributes:
                                competitor['attributes'] = attributes
                                Actor.log.info(f"  Attributes: {', '.join(attributes[:3])}...")

                            highlights = extract_highlights(lines)
                            if highlights:
                                competitor['highlights'] = highlights

                            photo_count = extract_photo_count(lines)
                            if photo_count:
                                competitor['photo_count'] = photo_count
                                Actor.log.info(f"  Photos: {photo_count}")

                            hours = extract_hours(lines)
                            if hours:
                                competitor['hours'] = hours
                                Actor.log.info(f"  Hours: Found for {len(hours)} days")

                            page_html = await page2.content()
                            place_id_match = re.search(r'(ChIJ[A-Za-z0-9_-]+)', page_html)
                            if place_id_match:
                                competitor['place_id'] = place_id_match.group(1)

                            await page2.keyboard.press('Escape')
                            await asyncio.sleep(1)
                            break

                    except Exception:
                        continue

                if not clicked:
                    Actor.log.info(f"  Could not find {competitor['business_name']} in view")
                    # Scroll the results feed so the card may come into view next time.
                    try:
                        await page2.evaluate('document.querySelector("[role=\'feed\']").scrollTop += 500')
                        await asyncio.sleep(1.5)
                    except Exception:
                        pass

            except Exception as e:
                Actor.log.error(f"  Error: {str(e)}")
                try:
                    await page2.keyboard.press('Escape')
                    await asyncio.sleep(0.5)
                except Exception:
                    pass
                continue

        Actor.log.info(f"PASS 2 COMPLETE: Extracted details for {successful_extractions} businesses")

        # Derive city/state for every competitor, falling back to the
        # search location when the address alone is not enough.
        for competitor in competitors:
            if competitor['address']:
                components = parse_address_components(competitor['address'])
                competitor['city'] = components['city']
                competitor['state'] = components['state']

            if search_location and (not competitor['city'] or not competitor['state']):
                search_components = parse_address_components(search_location)
                if not competitor['city'] and search_components['city']:
                    competitor['city'] = search_components['city']
                if not competitor['state'] and search_components['state']:
                    competitor['state'] = search_components['state']

    except Exception as e:
        Actor.log.error(f"Error in Pass 2: {e}")

    finally:
        Actor.log.info("Closing Pass 2 browser...")
        await browser2.close()

    return competitors

async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        input_method = actor_input.get('inputMethod', 'structured')
        search_query = actor_input.get('searchQuery', '')

        if not search_query:
            Actor.log.error("No search query provided. Please specify what type of business to search for.")
            await Actor.push_data({
                'error': 'No search query provided',
                'message': 'Please provide a search query (e.g., "restaurant", "auto repair", "dentist")'
            })
            return

        max_competitors = actor_input.get('maxCompetitors', 30)
        proxy_configuration = actor_input.get('proxyConfiguration')
        verbose = actor_input.get('verbose', False)

        # Collect (location, business name) pairs from whichever input style was used.
        locations = []
        business_names = []

        if input_method == 'structured':
            city = actor_input.get('city', '').strip()
            state = actor_input.get('state', '').strip()
            zip_code = actor_input.get('zipCode', '').strip()
            business_name = actor_input.get('businessName', '').strip()

            location_parts = []
            if city:
                location_parts.append(city)
            if state:
                location_parts.append(state)
            if zip_code:
                location_parts.append(zip_code)

            if location_parts:
                location_str = ', '.join(location_parts)
                locations.append(location_str)
                business_names.append(business_name)
                Actor.log.info(f"Structured input: {location_str} | Business: {business_name or 'None'}")

        elif input_method == 'bulk':
            # One location per line, optionally "location | business name".
            bulk_text = actor_input.get('bulkLocations', '')
            if bulk_text:
                lines = [line.strip() for line in bulk_text.strip().split('\n') if line.strip()]
                for line in lines:
                    if '|' in line:
                        location, business = line.split('|', 1)
                        locations.append(location.strip())
                        business_names.append(business.strip())
                    else:
                        locations.append(line)
                        business_names.append('')
                Actor.log.info(f"Bulk input: {len(locations)} locations parsed")

        # Legacy array input: [{'key': location, 'value': business name}, ...].
        locations_data = actor_input.get('locations', [])
        if locations_data and not locations:
            for item in locations_data:
                if isinstance(item, dict):
                    locations.append(item.get('key', ''))
                    business_names.append(item.get('value', ''))
                else:
                    locations.append(str(item))
                    business_names.append('')
            Actor.log.info(f"Legacy input: {len(locations)} locations from array")

        Actor.log.info(f"Total locations to process: {len(locations)}")
        Actor.log.info(f"Search query: '{search_query}'")
        Actor.log.info(f"Max competitors: {max_competitors}")
        if verbose:
            Actor.log.info("Verbose mode: ENABLED")

        if not locations:
            Actor.log.error("No locations provided. Please add at least one location.")
            await Actor.push_data({
                'error': 'No locations provided',
                'message': 'Please provide at least one location (address, Place ID, or coordinates)'
            })
            return

        total_results = 0

        async with async_playwright() as p:
            browser_options = {"headless": True}

            if proxy_configuration and proxy_configuration.get('useApifyProxy'):
                try:
                    Actor.log.info("Proxy requested but implementation needs to be verified for your Apify SDK version")
                except Exception as e:
                    Actor.log.warning(f"Could not set up proxy: {e}")

            Actor.log.info("Launching geocoding browser...")
            browser = await p.chromium.launch(**browser_options)
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            )
            page = await context.new_page()

            for idx, location_str in enumerate(locations):
                business_name = business_names[idx] if idx < len(business_names) else None

                if not location_str or not location_str.strip():
                    continue

                location_type, location_data = parse_location_type(location_str)

                Actor.log.info(f"\nProcessing location {idx + 1}/{len(locations)}: {location_str}")
                if business_name:
                    Actor.log.info(f"Business name: {business_name}")

                geo = {'found': False}

                try:
                    if location_type == 'coordinates':
                        lat, lng = location_data
                        Actor.log.info(f"Using provided coordinates: {lat}, {lng}")
                        geo = {'found': True, 'latitude': lat, 'longitude': lng}
                    elif location_type == 'place_id':
                        geo = await geocode_with_place_id(page, location_data)
                    else:
                        geo = await geocode_with_address(page, location_data)
                except Exception as e:
                    Actor.log.exception(f"Geocoding failed: {e}")
                    geo = {'found': False, 'error': str(e)}

                competitors_list = []
                if geo.get('found'):
                    try:
                        competitors_list = await scrape_competitors_two_pass(
                            p,
                            business_name,
                            geo['latitude'],
                            geo['longitude'],
                            search_query,
                            max_competitors,
                            location_str,
                            verbose
                        )
                        Actor.log.info(f"Found {len(competitors_list)} competitors for {business_name or location_str}")
                    except Exception as e:
                        Actor.log.exception(f"Competitor scraping failed: {e}")
                else:
                    Actor.log.warning(f"Skipping competitor search - geocoding failed: {geo.get('error', 'Unknown error')}")

                if competitors_list:
                    for competitor in competitors_list:
                        result = {
                            'competitor_name': str(competitor['business_name']),
                            'competitor_address': str(competitor['address'] or ''),
                            'competitor_city': str(competitor['city'] or ''),
                            'competitor_state': str(competitor['state'] or ''),
                            'competitor_category': str(competitor['main_category']),
                            'competitor_rank': int(competitor['google_rank']),
                            'competitor_rating': float(competitor['average_rating']) if competitor['average_rating'] is not None else 0.0,
                            'competitor_reviews': int(competitor['number_of_reviews']) if competitor['number_of_reviews'] is not None else 0,
                            'competitor_place_id': str(competitor['place_id'] or ''),
                            'latitude': float(geo.get('latitude')) if geo.get('latitude') is not None else 0.0,
                            'longitude': float(geo.get('longitude')) if geo.get('longitude') is not None else 0.0,
                            'search_query': str(search_query),
                            'competitor_website': str(competitor.get('website', '')),
                            'competitor_phone': str(competitor.get('phone', '')),
                            'is_source_business': bool(competitor['is_source_business']),
                            'input_location': str(location_str),
                            'input_business_name': str(business_name or '')
                        }

                        if verbose:
                            Actor.log.info(f"Pushing result: {json.dumps(result, indent=2)}")
                        await Actor.push_data(result)
                        total_results += 1
                else:
                    # Push an empty placeholder row so the location still appears in the dataset.
                    result = {
                        'competitor_name': '',
                        'competitor_address': '',
                        'competitor_city': '',
                        'competitor_state': '',
                        'competitor_category': '',
                        'competitor_rank': 0,
                        'competitor_rating': 0.0,
                        'competitor_reviews': 0,
                        'competitor_place_id': '',
                        'latitude': float(geo.get('latitude')) if geo.get('latitude') is not None else 0.0,
                        'longitude': float(geo.get('longitude')) if geo.get('longitude') is not None else 0.0,
                        'search_query': str(search_query),
                        'competitor_website': '',
                        'competitor_phone': '',
                        'is_source_business': False,
                        'input_location': str(location_str),
                        'input_business_name': str(business_name or '')
                    }
                    await Actor.push_data(result)
                    total_results += 1

            await browser.close()

        Actor.log.info(f"\nScraping completed! Processed {len(locations)} locations and found {total_results} total results.")


if __name__ == "__main__":
    asyncio.run(main())
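
# Illustrative Actor input (field names match those read in main() above;
# the values are hypothetical):
#
#   {
#       "inputMethod": "structured",
#       "searchQuery": "dentist",
#       "city": "Bellevue",
#       "state": "WA",
#       "zipCode": "98004",
#       "businessName": "Example Dental",
#       "maxCompetitors": 30,
#       "verbose": false
#   }
#
# With "inputMethod": "bulk", supply "bulkLocations" instead: one entry per
# line, optionally "location | business name".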