import asyncio
import re
from urllib.parse import quote_plus

from apify import Actor
from playwright.async_api import async_playwright


def is_valid_business_name(name):
    """Heuristic filter: reject layout artifacts, ads, prices, and addresses."""
    if not name or not name.strip(): return False
    # Quoted lines are review snippets, not names.
    if name.strip().startswith('"') and name.strip().endswith('"'): return False
    name_lower = name.strip().lower()
    if len(name_lower) > 80: return False

    # Lone separator characters left over from the card layout.
    if name_lower in ['x', '+', '-', '·', '|', '/', '\\', '^', '*']: return False
    if name_lower.startswith(('·', '$', '#', '@', 'http', 'sponsored')): return False
    # Street addresses ("123 Main St ...") are not business names.
    if re.match(r'^\d+\s+\w+\s+(st|street|ave|rd|dr|blvd)', name_lower): return False
    skip_patterns = ['google maps', 'search results', 'filter by', 'sort by', 'reviews', 'directions', 'open now', 'closed']
    if any(pattern in name_lower for pattern in skip_patterns): return False
    return True

def clean_business_name(name):
    """Strip ratings, prices, hours, and list-index noise from a scraped name."""
    if not name: return ""
    cleaned = name.strip().rstrip('&|\\').strip()
    cleanup_patterns = [
        r'\s*·.*$', r'^\s*\d+\.\s*', r'\s*\(\d+\)\s*$', r'\s*\d+\s*reviews?$',
        r'\s*\$+.*$', r'\s*Open\s*(24\s*hours?|now).*$', r'\s*Closed.*$', r'\s*Medical spa.*$'
    ]
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()
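
# Example with a hypothetical scraped line:
# clean_business_name("1. Glow Med Spa · Open 24 hours") -> "Glow Med Spa"
# (everything after the first "·" goes first, then the leading list index).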

def extract_rating(lines):
    """Return the first 'd.d' rating found among the listing lines, if any."""
    for line in lines:
        match = re.search(r'\b(\d\.\d)\b', line)
        if match:
            try: return float(match.group(1))
            except ValueError: continue
    return None

def extract_review_count(lines):
    """Return the first parenthesized review count, e.g. '(312)', if any."""
    for line in lines:
        match = re.search(r'\((\d{1,5})\)', line)
        if match:
            try: return int(match.group(1))
            except ValueError: continue
    return None
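
# Example with hypothetical card lines:
# lines = ["Glow Med Spa", "4.8", "(312)", "123 Main St"]
# extract_rating(lines) -> 4.8, extract_review_count(lines) -> 312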

def extract_address(lines):
    """Collect address-looking lines from a listing card and join them."""
    address_parts = []
    found_address_start = False

    for i, line in enumerate(lines):
        # Too short to be a useful address fragment.
        if len(line) < 5:
            continue
        # Status and ad labels.
        if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now', 'temporarily closed']:
            continue
        # Bare ratings ("4.8") or review counts ("(312)").
        if re.match(r'^[\d.]+$', line) or re.match(r'^\(\d+\)$', line):
            continue

        has_street_pattern = re.search(r'\d+.*?(?:st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|suite|ste|unit|apt|#)', line, re.IGNORECASE)
        has_state_zip = re.search(r'\b[A-Z]{2}\s+\d{5}\b', line)
        has_po_box = re.search(r'(p\.?o\.?\s*box|pmb)\s*\d+', line, re.IGNORECASE)

        if has_street_pattern or has_state_zip or has_po_box:
            found_address_start = True
            # Keep only the text after the last "·" separator; category and
            # hours come before it.
            cleaned_line = line.split('·')[-1].strip()
            cleaned_line = re.sub(r'^(Open|Closed|Hours|Directions).*?·\s*', '', cleaned_line, flags=re.IGNORECASE)
            address_parts.append(cleaned_line)
        elif found_address_start and i < len(lines) - 1:
            # Continuation lines may carry city/state/zip; a category or UI
            # label instead means the address block has ended.
            if not re.search(r'(medical spa|wellness center|aesthetic|dermatology|clinic|reviews?|rating|open|closed|hours|directions|website|call)', line, re.IGNORECASE):
                if re.search(r'\b[A-Z]{2}\b', line) or re.search(r'\b\d{5}\b', line) or len(line) < 50:
                    address_parts.append(line.strip())
            else:
                break

    if address_parts:
        # Join with commas unless a comma is already present at the seam.
        full_address = address_parts[0]
        for part in address_parts[1:]:
            if not full_address.endswith(',') and not part.startswith(','):
                full_address += ', ' + part
            else:
                full_address += ' ' + part
        return full_address.strip()

    return ""

def determine_category(lines):
    """Pick a coarse category from the card text; default to 'Medical spa'."""
    text = ' '.join(lines).lower()
    if 'dermatology' in text: return "Dermatology clinic"
    if 'wellness' in text: return "Wellness center"
    if 'aesthetic' in text: return "Aesthetic clinic"
    return "Medical spa"

def extract_place_ids(html_content: str) -> dict:
    """Pull a Place ID ('ChIJ...') and a hex CID ('0x...') out of raw HTML."""
    ids = {'place_id': None, 'cid': None}
    place_id_match = re.search(r'(ChIJ[A-Za-z0-9_-]+)', html_content)
    if place_id_match:
        ids['place_id'] = place_id_match.group(1)
    cid_match = re.search(r'(0x[a-f0-9:]+)', html_content)
    if cid_match:
        ids['cid'] = cid_match.group(1)
    return ids
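
# Example with a hypothetical HTML fragment:
# extract_place_ids('<a href="/maps/place/?q=place_id:ChIJN1t_tDeuEmsRUsoyG83frY4">0x89c2584f1a2b3c4d</a>')
# -> {'place_id': 'ChIJN1t_tDeuEmsRUsoyG83frY4', 'cid': '0x89c2584f1a2b3c4d'}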

def parse_address_components(address):
    """Extract city, state, and zipcode from an address string."""
    if not address:
        return {'city': '', 'state': '', 'zipcode': ''}

    components = {'city': '', 'state': '', 'zipcode': ''}
    original_address = address

    # Try the most specific zip placements first: ", 12345" at the end,
    # then " 12345" at the end, then the last 5-digit group anywhere.
    zip_patterns = [
        r',\s*(\d{5}(?:-\d{4})?)\s*$',
        r'\s+(\d{5}(?:-\d{4})?)\s*$',
        r'\b(?<!^\d)(\d{5}(?:-\d{4})?)\b(?!.*\d{5})'
    ]

    for pattern in zip_patterns:
        zip_match = re.search(pattern, address)
        if zip_match:
            components['zipcode'] = zip_match.group(1)
            address = address.replace(zip_match.group(0), '').strip().rstrip(',')
            break

    # Two-letter US state abbreviations (plus DC).
    state_pattern = r'\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC)\b'

    # Use the last match so street tokens like "DR" don't win over the state.
    state_matches = list(re.finditer(state_pattern, address, re.IGNORECASE))
    if state_matches:
        state_match = state_matches[-1]
        components['state'] = state_match.group(1).upper()
        address = address[:state_match.start()] + address[state_match.end():]
        address = address.strip().rstrip(',').strip()

    # What remains should be "street, [suite,] city": drop street and suite
    # parts and treat the last surviving comma-separated part as the city.
    parts = address.split(',')

    filtered_parts = []
    for part in parts:
        part = part.strip()
        if not part:
            continue
        # A leading house number marks a street line.
        if re.match(r'^\d+\s+\w+', part):
            continue
        # Suite/unit/building designators.
        if re.search(r'\b(suite|ste|unit|apt|building|bldg|floor|fl|#)\s*[\w\d-]*\b', part, re.IGNORECASE):
            continue
        # Parts ending in a street suffix.
        if re.search(r'\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|plaza|plz|cir|circle|ter|terrace|commons|cmns)\s*(?:#?\d+)?$', part, re.IGNORECASE):
            continue
        filtered_parts.append(part)

    if filtered_parts:
        city_candidate = filtered_parts[-1]
        # Strip trailing suite/building noise that survived the filters above.
        city_candidate = re.sub(r'\b(suite|ste|unit|apt|#)\s*[\w\d-]+$', '', city_candidate, flags=re.IGNORECASE)
        city_candidate = re.sub(r'\s*(building|bldg|center|ctr|office|tower|plaza|mall|complex)$', '', city_candidate, flags=re.IGNORECASE)
        city_candidate = city_candidate.strip()

        # Prefer a match against a short list of cities this actor targets.
        known_cities = ['Ashburn', 'Jacksonville', 'Santa Monica', 'Los Angeles', 'Whittier',
                        'Beverly Hills', 'Pasadena', 'Encino', 'Burbank', 'Glendale', 'Montebello',
                        'Hollywood', 'Lawndale', 'Sherman Oaks', 'Sterling', 'Leesburg']

        for known_city in known_cities:
            if known_city.lower() in city_candidate.lower():
                components['city'] = known_city
                break

        # Otherwise accept the candidate unless it starts with a digit.
        if not components['city'] and city_candidate and not re.match(r'^\d', city_candidate):
            components['city'] = city_candidate

    # Last-resort fallbacks for frequently seen locations.
    if components['state'] == 'FL' and not components['city']:
        if 'jacksonville' in original_address.lower():
            components['city'] = 'Jacksonville'

    if components['state'] == 'VA' and not components['city']:
        if 'ashburn' in original_address.lower():
            components['city'] = 'Ashburn'

    return components
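
# Worked example (hypothetical address):
# parse_address_components("123 Main St, Suite 4, Ashburn, VA 20147")
# -> {'city': 'Ashburn', 'state': 'VA', 'zipcode': '20147'}
# The zip is matched first, then the trailing "VA"; "Ashburn" is the last
# comma-separated part left after the street and suite parts are dropped.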

def parse_location_type(location_str):
    """Determine whether the location is an address, a Place ID, or coordinates."""
    location_str = location_str.strip()

    # Google Place IDs start with "ChIJ".
    if location_str.startswith('ChIJ'):
        return 'place_id', location_str

    # "lat,lng" pairs, with optional whitespace around the comma.
    normalized = location_str.replace(' ', '')
    coord_pattern = r'^-?\d+\.?\d*,-?\d+\.?\d*$'
    if re.match(coord_pattern, normalized):
        parts = normalized.split(',')
        try:
            lat = float(parts[0])
            lng = float(parts[1])
            return 'coordinates', (lat, lng)
        except ValueError:
            pass

    # Anything else is treated as a free-form address.
    return 'address', location_str
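
# Examples:
# parse_location_type("ChIJN1t_tDeuEmsRUsoyG83frY4") -> ('place_id', 'ChIJN1t_tDeuEmsRUsoyG83frY4')
# parse_location_type("38.95, -77.45") -> ('coordinates', (38.95, -77.45))
# parse_location_type("123 Main St, Ashburn, VA") -> ('address', '123 Main St, Ashburn, VA')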


async def geocode_with_address(page, address: str) -> dict:
    """Resolve an address to coordinates by reading them back from the Maps URL."""
    Actor.log.info(f"Geocoding address: {address}...")
    query = quote_plus(address)
    maps_url = f"https://www.google.com/maps/search/{query}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    # Once the search settles, the URL contains an "@lat,lng,zoom" segment.
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match:
        return {'found': False, 'error': 'Could not find coordinates from address'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, **ids}

async def geocode_with_place_id(page, place_id: str) -> dict:
    """Resolve a Place ID to coordinates via the Maps search URL API."""
    Actor.log.info(f"Geocoding Place ID: {place_id}...")
    # The URL API requires a 'query' value, but 'query_place_id' takes
    # precedence, so a placeholder query works here.
    maps_url = f"https://www.google.com/maps/search/?api=1&query=some_text&query_place_id={place_id}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match:
        return {'found': False, 'error': 'Could not find coordinates from Place ID'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, 'place_id': place_id, **ids}

async def scrape_competitors_at_coordinates(page, source_company_name: str, lat: float, lon: float,
                                            search_query: str, max_competitors: int):
    """Search Google Maps around (lat, lon) and parse the result cards."""
    Actor.log.info(f"Scraping competitors at {lat},{lon} for '{search_query}'...")
    query = quote_plus(search_query)
    zoom_level = 11
    url = f"https://www.google.com/maps/search/{query}/@{lat},{lon},{zoom_level}z"
    Actor.log.info(f"Using zoom level {zoom_level} for wider area coverage")
    await page.goto(url, wait_until='domcontentloaded', timeout=45000)

    # Zoom the page way out so more result cards render without scrolling.
    await page.evaluate('document.body.style.zoom="0.25"')
    await asyncio.sleep(3)

    # Try the generic role selector first, then Maps-specific class names
    # (the class names are obfuscated and change without notice).
    selectors = ['[role="article"]', '.Nv2PK.THOPZb.CpccDe', '.lI9IFe', '.hfpxzc']
    elems = []
    for sel in selectors:
        elems = await page.query_selector_all(sel)
        if elems and len(elems) > 5:
            Actor.log.info(f"Found {len(elems)} elements using selector: {sel}")
            break

    competitors = []
    cleaned_source_name = re.sub(r'[^a-z0-9]', '', source_company_name.lower()) if source_company_name else ""

    for element in elems[:max_competitors]:
        element_html = await element.inner_html()
        ids = extract_place_ids(element_html)
        full_text = await element.inner_text()
        lines = [ln.strip() for ln in full_text.split('\n') if ln.strip()]

        # The name is usually the first line that survives the filters below.
        business_name = None
        for line in lines:
            # Skip bare rank numbers and status/ad labels.
            if line.strip().isdigit() and len(line.strip()) <= 2:
                continue
            if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now']:
                continue
            if is_valid_business_name(line):
                business_name = clean_business_name(line)
                if business_name:
                    break

        if not business_name or any(c['business_name'] == business_name for c in competitors):
            continue

        # Flag the source business itself via fuzzy name containment.
        is_source = False
        if cleaned_source_name:
            cleaned_competitor_name = re.sub(r'[^a-z0-9]', '', business_name.lower())
            is_source = cleaned_source_name in cleaned_competitor_name or cleaned_competitor_name in cleaned_source_name

        address = extract_address(lines)
        address_components = parse_address_components(address)

        competitor = {
            'place_id': ids.get('place_id'),
            'business_name': business_name,
            'google_rank': len(competitors) + 1,
            'is_source_business': is_source,
            'average_rating': extract_rating(lines),
            'number_of_reviews': extract_review_count(lines),
            'main_category': determine_category(lines),
            'address': address,
            'city': address_components['city'],
            'state': address_components['state'],
            'zipcode': address_components['zipcode']
        }
        competitors.append(competitor)

    return competitors


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Actor input: a list of locations, a search query, a per-location
        # result cap, and an optional proxy configuration.
        locations_data = actor_input.get('locations', [])
        search_query = actor_input.get('searchQuery', 'medical spa')
        max_competitors = actor_input.get('maxCompetitors', 30)
        proxy_configuration = actor_input.get('proxyConfiguration')

        Actor.log.info(f"Received input: {len(locations_data)} locations")
        Actor.log.info(f"Search query: {search_query}")
        Actor.log.info(f"Max competitors per location: {max_competitors}")

        # Each location item may be a {'key': location, 'value': business name}
        # pair or a bare string.
        locations = []
        business_names = []
        for item in locations_data:
            if isinstance(item, dict):
                locations.append(item.get('key', ''))
                business_names.append(item.get('value', ''))
            else:
                locations.append(str(item))
                business_names.append('')

        if not locations:
            Actor.log.error("No locations provided. Please add at least one location.")
            await Actor.push_data({
                'error': 'No locations provided',
                'message': 'Please provide at least one location (address, Place ID, or coordinates)'
            })
            return

        Actor.log.info(f"Processing {len(locations)} locations...")

        total_results = 0

        async with async_playwright() as p:
            browser_options = {"headless": True}

            if proxy_configuration and proxy_configuration.get('useApifyProxy'):
                try:
                    # Proxy wiring is intentionally left as a stub; see the
                    # sketch below for one way to hook it up.
                    Actor.log.info("Proxy requested but implementation needs to be verified for your Apify SDK version")
                except Exception as e:
                    Actor.log.warning(f"Could not set up proxy: {e}")
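
            # A minimal sketch of that wiring, assuming the Apify Python SDK's
            # Actor.create_proxy_configuration() and Playwright's launch(proxy=...)
            # option; verify against your SDK version before uncommenting:
            #
            # proxy_config = await Actor.create_proxy_configuration(
            #     actor_proxy_input=proxy_configuration
            # )
            # if proxy_config:
            #     proxy_url = await proxy_config.new_url()
            #     browser_options["proxy"] = {"server": proxy_url}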

            Actor.log.info("Launching browser...")
            browser = await p.chromium.launch(**browser_options)
            page = await browser.new_page()

            for idx, location_str in enumerate(locations):
                business_name = business_names[idx] if idx < len(business_names) else None

                if not location_str or not location_str.strip():
                    continue

                location_type, location_data = parse_location_type(location_str)

                Actor.log.info(f"\nProcessing location {idx + 1}/{len(locations)}: {location_str}")
                if business_name:
                    Actor.log.info(f"Business name: {business_name}")

                geo = {'found': False}

                try:
                    if location_type == 'coordinates':
                        lat, lng = location_data
                        Actor.log.info(f"Using provided coordinates: {lat}, {lng}")
                        geo = {'found': True, 'latitude': lat, 'longitude': lng}
                    elif location_type == 'place_id':
                        geo = await geocode_with_place_id(page, location_data)
                    else:
                        geo = await geocode_with_address(page, location_data)
                except Exception as e:
                    Actor.log.exception(f"Geocoding failed: {e}")
                    geo = {'found': False, 'error': str(e)}

                competitors_list = []
                if geo.get('found'):
                    try:
                        competitors_list = await scrape_competitors_at_coordinates(
                            page,
                            business_name,
                            geo['latitude'],
                            geo['longitude'],
                            search_query,
                            max_competitors
                        )
                        Actor.log.info(f"Found {len(competitors_list)} competitors for {business_name or location_str}")
                    except Exception as e:
                        Actor.log.exception(f"Competitor scraping failed: {e}")
                else:
                    Actor.log.warning(f"Skipping competitor search - geocoding failed: {geo.get('error', 'Unknown error')}")

                if competitors_list:
                    # Push one dataset row per competitor.
                    for competitor in competitors_list:
                        result = {
                            'competitor_name': competitor['business_name'],
                            'competitor_address': competitor['address'] or '',
                            'competitor_city': competitor['city'] or '',
                            'competitor_state': competitor['state'] or '',
                            'competitor_zipcode': competitor['zipcode'] or '',
                            'competitor_category': competitor['main_category'],
                            'competitor_rank': competitor['google_rank'],
                            'competitor_rating': competitor['average_rating'],
                            'competitor_reviews': competitor['number_of_reviews'],
                            'competitor_place_id': competitor['place_id'] or '',
                            'latitude': geo.get('latitude'),
                            'longitude': geo.get('longitude'),
                            'search_query': search_query,
                            'is_source_business': competitor['is_source_business'],
                            'input_location': location_str,
                            'input_business_name': business_name or ''
                        }
                        await Actor.push_data(result)
                        total_results += 1
                else:
                    # Push an empty row so the input location still appears in the dataset.
                    result = {
                        'competitor_name': '',
                        'competitor_address': '',
                        'competitor_city': '',
                        'competitor_state': '',
                        'competitor_zipcode': '',
                        'competitor_category': '',
                        'competitor_rank': None,
                        'competitor_rating': None,
                        'competitor_reviews': None,
                        'competitor_place_id': '',
                        'latitude': geo.get('latitude'),
                        'longitude': geo.get('longitude'),
                        'search_query': search_query,
                        'is_source_business': None,
                        'input_location': location_str,
                        'input_business_name': business_name or ''
                    }
                    await Actor.push_data(result)
                    total_results += 1

            await browser.close()
            Actor.log.info(f"\nScraping completed! Processed {len(locations)} locations and found {total_results} total results.")


if __name__ == "__main__":
    asyncio.run(main())