1"""
2Google Maps Lead Generation Scraper for Apify
3Uses Crawlee for better anti-detection and Apify's built-in proxy management
4"""
5
6from __future__ import annotations
7
8import asyncio
9import json
10import random
11import re
12from datetime import datetime, timedelta
13from typing import Any, Dict, List, Optional
14from urllib.parse import quote
15
16from apify import Actor
17from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
18from crawlee import Request
19
20
21from . import constants as const
22from .validators import validate_and_fail_if_invalid, validate_extracted_data, sanitize_user_agent
23from .lead_enrichment import LeadEnrichmentPipeline, enrich_leads_batch
24
25
class GoogleMapsLeadScraper:
    """Stateful helper for extracting business leads from Google Maps pages.

    Holds cross-request state for one crawl run:

    * ``results`` — per-search result dicts appended by the crawler handler.
    * ``processed_businesses`` — "name_address" keys already seen, used for
      de-duplication across scroll passes.
    * ``stats`` — attempt/success/failure counters for the email, phone and
      website extractors, reported as success rates at the end of the run.

    All extraction methods expect a Playwright ``page`` object and run
    JavaScript inside the page via ``page.evaluate``.
    """

    def __init__(self) -> None:
        # Accumulated per-search result dicts (one per processed request).
        self.results: List[Dict] = []
        # De-duplication keys of businesses already collected.
        self.processed_businesses: set = set()


        # Extraction telemetry; incremented by the extract_* methods below.
        self.stats: Dict[str, int] = {
            'email_attempts': 0,
            'email_successes': 0,
            'email_failures': 0,
            'phone_attempts': 0,
            'phone_successes': 0,
            'phone_failures': 0,
            'website_attempts': 0,
            'website_successes': 0,
            'website_failures': 0,
        }

    async def extract_email_from_website(self, website_url: str, page, timeout_ms: int = 5000, retry_count: int = 0) -> str:
        """
        Extract email from business website with retry logic and multiple fallback methods

        Args:
            website_url: The URL to extract email from
            page: Playwright page object
            timeout_ms: Timeout in milliseconds
            retry_count: Current retry attempt number

        Returns:
            Extracted email address or empty string
        """
        # Guard: only navigate to absolute http(s) URLs.
        if not website_url or not website_url.startswith('http'):
            return ''

        # NOTE(review): this counter is also incremented on each recursive
        # retry, so 'email_attempts' counts attempts, not distinct websites.
        self.stats['email_attempts'] += 1

        try:
            await page.goto(website_url, wait_until='domcontentloaded', timeout=timeout_ms)
            await page.wait_for_timeout(const.EMAIL_QUICK_WAIT_MS)

            # Strategy 1: mailto: links matched by the configured selectors.
            email = await page.evaluate(f'''() => {{
                const mailtoLinks = Array.from(document.querySelectorAll('{", ".join(const.EMAIL_LINK_SELECTORS)}'));
                for (const link of mailtoLinks) {{
                    const href = link.href || link.getAttribute('href') || '';
                    if (href.startsWith('mailto:')) {{
                        const email = href.replace('mailto:', '').split('?')[0];
                        if (email && email.includes('@')) {{
                            return email;
                        }}
                    }}
                }}
                return '';
            }}''')

            if email:
                self.stats['email_successes'] += 1
                return email

            # Strategy 2: regex-scan the whole page text, filtering out the
            # configured non-business patterns (EMAIL_EXCLUDE_PATTERNS).
            email = await page.evaluate(f'''() => {{
                const emailRegex = /{const.EMAIL_REGEX_PATTERN}/g;
                const pageText = document.body.textContent || '';
                const emails = pageText.match(emailRegex) || [];

                // Filter out common non-business emails
                const excludePatterns = {json.dumps(const.EMAIL_EXCLUDE_PATTERNS)};
                const filtered = emails.filter(email => {{
                    const lower = email.toLowerCase();
                    return !excludePatterns.some(pattern => lower.includes(pattern));
                }});

                return filtered[0] || '';
            }}''')

            if email:
                self.stats['email_successes'] += 1
                return email

            # Strategy 3: locate a "contact"-style link (by link text or href)
            # and repeat the regex scan on that page.
            contact_link = await page.evaluate(f'''() => {{
                const contactPatterns = {json.dumps(const.CONTACT_PAGE_PATTERNS)};
                const links = Array.from(document.querySelectorAll('a'));

                for (const link of links) {{
                    const text = (link.textContent || '').toLowerCase();
                    const href = (link.href || '').toLowerCase();

                    for (const pattern of contactPatterns) {{
                        if (text.includes(pattern) || href.includes(pattern)) {{
                            return link.href;
                        }}
                    }}
                }}
                return '';
            }}''')

            if contact_link:
                try:
                    await page.goto(contact_link, wait_until='domcontentloaded', timeout=timeout_ms)
                    await page.wait_for_timeout(const.EMAIL_QUICK_WAIT_MS)

                    # Same filtered regex scan as strategy 2, on the contact page.
                    email = await page.evaluate(f'''() => {{
                        const emailRegex = /{const.EMAIL_REGEX_PATTERN}/g;
                        const pageText = document.body.textContent || '';
                        const emails = pageText.match(emailRegex) || [];

                        const excludePatterns = {json.dumps(const.EMAIL_EXCLUDE_PATTERNS)};
                        const filtered = emails.filter(email => {{
                            const lower = email.toLowerCase();
                            return !excludePatterns.some(pattern => lower.includes(pattern));
                        }});

                        return filtered[0] || '';
                    }}''')

                    if email:
                        self.stats['email_successes'] += 1
                        return email
                except Exception as contact_error:
                    # Best-effort: a failed contact-page visit is not fatal.
                    Actor.log.debug(f"Contact page navigation failed: {contact_error}")

            # Nothing found: retry the whole extraction (bounded by
            # EMAIL_RETRY_ATTEMPTS) before recording a failure.
            if retry_count < const.EMAIL_RETRY_ATTEMPTS:
                Actor.log.debug(f"Email extraction failed, retrying ({retry_count + 1}/{const.EMAIL_RETRY_ATTEMPTS})...")
                await page.wait_for_timeout(const.EMAIL_RETRY_DELAY_MS)
                return await self.extract_email_from_website(website_url, page, timeout_ms, retry_count + 1)

            self.stats['email_failures'] += 1
            return ''

        except Exception as e:
            Actor.log.debug(f"Email extraction error: {e}")
            self.stats['email_failures'] += 1
            return ''

    async def extract_phone_from_business_page(self, page, business_url: str) -> str:
        """Extract phone number from individual business page with international format support"""
        self.stats['phone_attempts'] += 1

        try:
            # Navigate to the individual business detail page first.
            await page.goto(business_url, wait_until='domcontentloaded', timeout=const.BUSINESS_PAGE_TIMEOUT_MS)
            await page.wait_for_timeout(const.PHONE_EXTRACTION_WAIT_MS)

            # Try the configured selectors first, then fall back to scanning
            # the page text with a list of international phone regexes.
            phone = await page.evaluate(f'''() => {{
                // Current LeadLocator Pro business page phone selectors
                const phoneSelectors = {json.dumps(const.PHONE_SELECTORS)};

                for (const selector of phoneSelectors) {{
                    const phoneEl = document.querySelector(selector);
                    if (phoneEl) {{
                        const phoneText = phoneEl.textContent || phoneEl.getAttribute('aria-label') || phoneEl.href || '';
                        // Extract phone number pattern (international support)
                        const phoneMatch = phoneText.match(/[\\+]?[(]?[\\d\\s\\-\\(\\)]{{10,}}/);
                        if (phoneMatch) {{
                            return phoneMatch[0].trim();
                        }}
                    }}
                }}

                // Fallback: look for phone patterns in page text (international formats)
                const pageText = document.body.textContent || '';

                // Try multiple international phone patterns
                const patterns = [
                    /\\+1[-\\.\\s]?\\(?\\d{{3}}\\)?[-\\.\\s]?\\d{{3}}[-\\.\\s]?\\d{{4}}/g,  // US/CA with +1
                    /\\(?\\d{{3}}\\)?[-\\.\\s]?\\d{{3}}[-\\.\\s]?\\d{{4}}/g,  // US/CA format
                    /\\+44\\s?7\\d{{3}}\\s?\\d{{3}}\\s?\\d{{3}}/g,  // UK mobile
                    /\\+61\\s?\\d{{1}}\\s?\\d{{4}}\\s?\\d{{4}}/g,  // Australian
                    /\\+\\d{{1,3}}[-\\.\\s]?\\(?\\d{{1,4}}\\)?[-\\.\\s]?\\d{{1,5}}[-\\.\\s]?\\d{{1,5}}/g,  // Generic international
                ];

                for (const pattern of patterns) {{
                    const phoneMatch = pageText.match(pattern);
                    if (phoneMatch) {{
                        return phoneMatch[0].trim();
                    }}
                }}

                return '';
            }}''')

            if phone:
                self.stats['phone_successes'] += 1
            else:
                self.stats['phone_failures'] += 1

            return phone

        except Exception as e:
            Actor.log.debug(f"Phone extraction failed: {e}")
            self.stats['phone_failures'] += 1
            return ''

    async def extract_website_from_business_page(self, page) -> str:
        """Extract website URL from business page with enhanced selectors"""
        self.stats['website_attempts'] += 1

        try:
            # Assumes the page is already on a business detail page (see
            # extract_phone_from_business_page); google.com links are skipped
            # so Maps-internal links are not mistaken for the business site.
            website = await page.evaluate(f'''() => {{
                const websiteSelectors = {json.dumps(const.WEBSITE_SELECTORS)};

                for (const selector of websiteSelectors) {{
                    const websiteEl = document.querySelector(selector);
                    if (websiteEl && websiteEl.href && !websiteEl.href.includes('google.com')) {{
                        return websiteEl.href;
                    }}
                }}
                return '';
            }}''')

            if website:
                self.stats['website_successes'] += 1
            else:
                self.stats['website_failures'] += 1

            return website

        except Exception as e:
            Actor.log.debug(f"Website extraction failed: {e}")
            self.stats['website_failures'] += 1
            return ''


    async def extract_businesses_from_page(self, page) -> List[Dict]:
        """Extract business listings using 2025-current 🌐📍 LeadLocator Pro selectors

        Returns a list of dicts with name/rating/review_count/address/category/
        price_level/maps_url; phone, website and hours are returned empty
        because they are not present in the search-results view.
        """
        return await page.evaluate('''() => {
            const results = [];

            // 2025 Current 🌐📍 LeadLocator Pro business container selectors
            const businessContainerSelectors = [
                'div.Nv2PK a',  // Primary business container with link
                'div.tH5CWc a',  // Alternative container
                'div.THOPZb a',  // Another container variant
                'a.hfpxzc'  // Direct business link class
            ];

            const businessLinks = new Set();

            // Find all business links using current selectors
            businessContainerSelectors.forEach(selector => {
                document.querySelectorAll(selector).forEach(el => {
                    if (el.href && el.href.includes('/maps/place/')) {
                        businessLinks.add(el);
                    }
                });
            });

            console.log(`Found ${businessLinks.size} business links`);

            businessLinks.forEach((element) => {
                try {
                    // Find the business container (parent)
                    const container = element.closest('div.Nv2PK, div.tH5CWc, div.THOPZb') || element.parentElement;
                    if (!container) return;

                    // Extract business name using current selectors
                    let name = '';
                    const nameSelectors = [
                        '.qBF1Pd',  // Current primary name class
                        '[class*="fontHeadlineSmall"]',  // Fallback
                        '[aria-level="3"]',  // Accessibility fallback
                        'h3'  // HTML fallback
                    ];

                    for (const selector of nameSelectors) {
                        const nameEl = container.querySelector(selector);
                        if (nameEl && nameEl.textContent.trim()) {
                            name = nameEl.textContent.trim();
                            break;
                        }
                    }

                    if (!name || name.length < 2) return;

                    // Extract rating using current class
                    let rating = 0;
                    const ratingEl = container.querySelector('.MW4etd');
                    if (ratingEl) {
                        const ratingText = ratingEl.textContent || ratingEl.getAttribute('aria-label') || '';
                        const ratingMatch = ratingText.match(/([0-9][.,][0-9])/);
                        if (ratingMatch) {
                            rating = parseFloat(ratingMatch[1].replace(',', '.'));
                        }
                    }

                    // Extract review count using current class
                    let reviewCount = 0;
                    const reviewEl = container.querySelector('.UY7F9');
                    if (reviewEl) {
                        const reviewText = reviewEl.textContent || '';
                        const reviewMatch = reviewText.match(/([\\d,]+)/);
                        if (reviewMatch) {
                            reviewCount = parseInt(reviewMatch[1].replace(/,/g, ''));
                        }
                    }

                    // Extract address - look for address patterns in spans
                    let address = '';
                    const spans = container.querySelectorAll('span');
                    for (const span of spans) {
                        const text = span.textContent.trim();
                        // Look for address patterns
                        if ((text.includes('Street') || text.includes('Boulevard') || text.includes('Avenue') ||
                             text.includes('St') || text.includes('Blvd') || text.includes('Ave') ||
                             text.includes('Drive') || text.includes('Dr') || text.includes('Road') || text.includes('Rd')) &&
                            text.match(/\\d+/)) {
                            address = text.replace(/^[·•]\\s*/, '').trim();  // Remove leading bullet points
                            break;
                        }
                    }

                    // Extract category - look for business type descriptions
                    let category = '';
                    for (const span of spans) {
                        const text = span.textContent.trim();
                        // Look for category patterns (longer descriptive text)
                        if (text.length > 10 && text.length < 60 &&
                            !text.match(/^[0-9.(),·•\\s]+$/) &&  // Not just numbers/symbols
                            !text.includes('AM') && !text.includes('PM') &&  // Not hours
                            !address.includes(text) &&  // Not the address
                            !text.includes('Yamedhaminiwa') &&  // Skip translated terms
                            text !== name) {  // Not the business name
                            category = text;
                            break;
                        }
                    }

                    // Extract price level
                    let priceLevel = '';
                    const priceEl = container.querySelector('.wcldff.fontHeadlineSmall.Cbys4b');
                    if (priceEl) {
                        priceLevel = priceEl.textContent.trim();
                    }

                    // Note: Phone and website are typically NOT available in search results
                    // They require individual business page navigation

                    results.push({
                        name: name,
                        rating: rating,
                        review_count: reviewCount,
                        address: address,
                        category: category,
                        phone: '',  // Not available in search results
                        website: '',  // Not available in search results
                        price_level: priceLevel,
                        hours: '',  // Not available in search results
                        maps_url: element.href,
                        extracted_at: new Date().toISOString()
                    });

                } catch (e) {
                    console.error('Error extracting business:', e);
                }
            });

            return results;
        }''')

    async def scroll_results_panel(self, page) -> bool:
        """Scroll the results panel using current Google Maps structure

        Returns True when the scroll position actually advanced (i.e. more
        results may have loaded); False when nothing moved or an error occurred.
        """
        try:
            scrolled = await page.evaluate('''() => {
                // Current 🌐📍 LeadLocator Pro scrollable container selectors
                const scrollContainerSelectors = [
                    '[role="feed"]',  // Primary results feed
                    'div[style*="overflow-y: scroll"]',  // Scrollable div
                    'div[aria-label*="Results"]',  // Results container by aria-label
                    '.m6QErb .DxyBCb',  // Legacy fallback
                    '[data-value="Search results"]'  // Data attribute approach
                ];

                for (const selector of scrollContainerSelectors) {
                    const container = document.querySelector(selector);
                    if (container) {
                        const scrollTarget = container.parentElement || container;
                        const oldScrollTop = scrollTarget.scrollTop;

                        // Fast scroll to bottom
                        scrollTarget.scrollTo({
                            top: scrollTarget.scrollHeight,
                            behavior: 'instant'  // Fastest possible scroll
                        });

                        const newScrollTop = scrollTarget.scrollTop;
                        console.log(`Scrolled from ${oldScrollTop} to ${newScrollTop}`);
                        return newScrollTop > oldScrollTop;
                    }
                }

                // Last resort: page scroll
                const oldY = window.scrollY;
                window.scrollTo({
                    top: document.body.scrollHeight,
                    behavior: 'instant'
                });
                return window.scrollY > oldY;
            }''')

            if scrolled:
                # Give lazily-loaded results a moment to render.
                await page.wait_for_timeout(800)

            return scrolled
        except Exception as e:
            Actor.log.debug(f"Scroll attempt failed: {e}")
            return False
435
436
437
async def main() -> None:
    """Main entry point for the 🌐📍 LeadLocator Pro scraper.

    Reads actor input, configures a PlaywrightCrawler with optional Apify
    proxies, scrapes Google Maps search results (with optional per-business
    phone/website enhancement and website email extraction), optionally runs
    premium enrichment, charges per-lead events, and pushes each business to
    the dataset.
    """

    async with Actor:
        Actor.log.info(f'{const.EMOJI_START} LeadLocator Pro Lead Scraper starting with Crawlee...')

        # --- Input -------------------------------------------------------
        actor_input = await Actor.get_input() or {}
        Actor.log.info(f'📥 Input received: {json.dumps(actor_input, indent=2)}')

        # Fails the run with a clear message on invalid input.
        await validate_and_fail_if_invalid(actor_input)

        searches = actor_input.get('searches', [])
        max_results = actor_input.get('max_results_per_search', 20)
        use_proxies = actor_input.get('use_proxies', True)
        extract_emails = actor_input.get('extract_emails', False)
        enhanced_extraction = actor_input.get('enhanced_extraction', False)
        delay_between_requests = actor_input.get('delay_between_requests', 2)  # NOTE(review): currently unused
        # NOTE(review): input appears to be rating*10 (e.g. 45 -> 4.5 stars) — confirm against the input schema.
        minimum_rating = actor_input.get('minimum_rating', 0) / 10.0
        minimum_reviews = actor_input.get('minimum_reviews', 0)
        timeout_per_search = actor_input.get('timeout_per_search', 5) * 60  # minutes -> seconds

        # Premium feature flags.
        verify_emails = actor_input.get('verify_emails', False)
        find_social_profiles = actor_input.get('find_social_profiles', False)
        score_leads = actor_input.get('score_leads', False)
        premium_enrichment = actor_input.get('premium_enrichment', False)

        # Heuristic: small, short runs are treated as QA runs and skip the
        # slow per-business enhancement phase.
        total_expected = sum(s.get('max_results', max_results) for s in searches if isinstance(s, dict))
        is_qa_run = total_expected <= 50 and timeout_per_search <= 300

        # Premium enrichment implies all individual premium features.
        if premium_enrichment:
            verify_emails = True
            find_social_profiles = True
            score_leads = True
            Actor.log.info('💎 PREMIUM ENRICHMENT MODE: All premium features enabled')

        Actor.log.info(f'⚙️ Configuration: extract_emails={extract_emails}, enhanced_extraction={enhanced_extraction}')
        Actor.log.info(f'💎 Premium Features: verify_emails={verify_emails}, find_social={find_social_profiles}, score_leads={score_leads}')
        Actor.log.info(f'⏱️ Timeout per search: {timeout_per_search}s, Expected results: {total_expected}')
        if not extract_emails and not enhanced_extraction:
            Actor.log.info(f'🚀 FAST MODE: Basic extraction only (completes in 2-3 min). Enable email/enhanced extraction for premium leads.')

        # Backward-compat: accept a single query/location pair as input.
        if not searches:
            query = actor_input.get('query', '')
            location = actor_input.get('location', '')

            if query:
                searches = [{'query': query, 'location': location}]
                Actor.log.info(f'✅ Converted single search: "{query}" in "{location}"')
            else:
                Actor.log.error('❌ No search queries provided!')
                Actor.log.error('Expected format: {"searches": [{"query": "coffee shops", "location": "Seattle, WA"}]}')
                raise ValueError('No search queries provided')

        Actor.log.info(f'📊 Will process {len(searches)} search(es), max {max_results} results each')

        # --- Proxy configuration (best-effort) ---------------------------
        proxy_configuration = None
        if use_proxies:
            try:
                proxy_configuration = await Actor.create_proxy_configuration()
                if proxy_configuration:
                    Actor.log.info('✅ Apify proxy configuration created')
                else:
                    Actor.log.warning('⚠️ No proxy configuration available')
            except Exception as e:
                Actor.log.warning(f'⚠️ Proxy setup failed: {e}')

        # Shared scraper state (dedup, stats, results).
        scraper = GoogleMapsLeadScraper()

        import os
        is_headless = os.getenv('APIFY_HEADLESS', '1') == '1'

        # APIFY_IS_AT_HOME is set on the Apify platform; absent => local run.
        is_local = not os.getenv('APIFY_IS_AT_HOME')

        browser_launch_options = {}
        if is_local:
            # Local Chromium often needs the sandbox disabled.
            browser_launch_options['args'] = ['--no-sandbox', '--disable-setuid-sandbox']

        crawler = PlaywrightCrawler(
            proxy_configuration=proxy_configuration,
            browser_type='chromium',
            headless=is_headless,
            request_handler_timeout=timedelta(minutes=8),
            max_request_retries=const.MAX_REQUEST_RETRIES,
            browser_launch_options=browser_launch_options,
        )

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            """Process one Google Maps search URL: scroll, extract, enhance."""
            page = context.page
            request = context.request

            try:
                # Search metadata was JSON-encoded into user_data when the
                # request was created (see start_urls below).
                search_info = json.loads(request.user_data.get('search_info', '{}'))
                search_term = search_info.get('term', 'unknown')
                max_results_for_search = search_info.get('max_results', max_results)

                search_start_time = datetime.now()
                search_timeout = timedelta(seconds=timeout_per_search)

                Actor.log.info(f'🔍 Processing: {search_term} (timeout: {timeout_per_search}s)')

                await page.set_viewport_size({'width': 1920, 'height': 1080})

                # Navigation with one retry and a randomized backoff.
                navigation_successful = False
                for attempt in range(2):
                    try:
                        await page.goto(request.url, wait_until='domcontentloaded', timeout=30000)
                        navigation_successful = True
                        Actor.log.info(f'✅ Navigation successful on attempt {attempt + 1}')
                        break
                    except Exception as nav_error:
                        Actor.log.warning(f'⚠️ Navigation attempt {attempt + 1} failed: {nav_error}')
                        if attempt < 1:
                            await page.wait_for_timeout(random.randint(2000, 5000))

                if not navigation_successful:
                    Actor.log.error(f'❌ Failed to navigate to {request.url}')
                    return

                await page.wait_for_timeout(3000)

                # Wait for any of the known results-list selectors.
                results_found = False
                selectors_to_try = [
                    '[role="feed"]',
                    'a[href*="/maps/place/"]',
                    'div[jsaction*="mouseover"]',
                    '[data-result-index]',
                    'div[role="article"]'
                ]

                for selector in selectors_to_try:
                    try:
                        await page.wait_for_selector(selector, timeout=const.SELECTOR_WAIT_MS)
                        Actor.log.info(f'{const.EMOJI_SUCCESS} Results found with selector: {selector}')
                        results_found = True
                        break
                    except Exception as e:
                        Actor.log.debug(f"Selector '{selector}' not found: {e}")
                        continue

                if not results_found:
                    page_title = await page.title()
                    Actor.log.error(f'❌ No results found. Page title: {page_title}')
                    return

                # --- Phase 1: scroll + collect basic listings ------------
                all_businesses = []
                scroll_attempts = 0
                max_scroll_attempts = 3
                last_count = 0

                max_businesses_to_extract = max_results_for_search
                if is_qa_run:
                    max_businesses_to_extract = min(max_results_for_search, 50)

                basic_businesses = []
                while len(basic_businesses) < max_businesses_to_extract and scroll_attempts < max_scroll_attempts:
                    # Hard per-search timeout.
                    if datetime.now() - search_start_time > search_timeout:
                        Actor.log.warning(f'⏱️ Search timeout reached for: {search_term}')
                        break

                    businesses = await scraper.extract_businesses_from_page(page)

                    for business in businesses:
                        # Apply quality filters before dedup/collection.
                        if business.get('rating', 0) < minimum_rating:
                            continue

                        if business.get('review_count', 0) < minimum_reviews:
                            continue

                        business_key = f"{business['name']}_{business.get('address', '')}"
                        if business_key not in scraper.processed_businesses:
                            scraper.processed_businesses.add(business_key)
                            Actor.log.info(f'📋 Found business: {business["name"]} ({business.get("rating", 0)}⭐ {business.get("review_count", 0)} reviews)')
                            basic_businesses.append(business)

                    Actor.log.info(f'📈 Found {len(basic_businesses)} businesses so far...')

                    if len(basic_businesses) >= max_businesses_to_extract:
                        break

                    # Stop after two consecutive scrolls without new results.
                    if len(basic_businesses) == last_count:
                        scroll_attempts += 1
                        if scroll_attempts >= 2:
                            Actor.log.info('🔚 No new results after scrolling')
                            break
                    else:
                        scroll_attempts = 0
                        last_count = len(basic_businesses)

                    scrolled = await scraper.scroll_results_panel(page)
                    if not scrolled:
                        Actor.log.info('🔚 Cannot scroll further')
                        break

                    await page.wait_for_timeout(random.randint(300, 800))

                # --- Phase 2: optional phone/website enhancement ---------
                all_businesses = []
                time_remaining = search_timeout - (datetime.now() - search_start_time)

                if enhanced_extraction and not is_qa_run:
                    # Visit detail pages for the highest-value businesses first
                    # (ranked by rating * review_count).
                    high_value_businesses = sorted(basic_businesses,
                        key=lambda b: (b.get('rating', 0) * b.get('review_count', 0)), reverse=True)

                    Actor.log.info(f'💎 Phase 2: Enhancing high-value businesses with phone/website data')

                    enhancement_budget = min(10, len(high_value_businesses))
                    if time_remaining.total_seconds() > 60:

                        for i, business in enumerate(high_value_businesses[:enhancement_budget]):
                            enhancement_start = datetime.now()

                            # Stop enhancing at 85% of the search budget; keep
                            # remaining businesses with basic data only.
                            if (datetime.now() - search_start_time).total_seconds() > timeout_per_search * 0.85:
                                Actor.log.warning(f'⏱️ Enhancement time limit reached, processing remaining {len(high_value_businesses) - i} businesses with basic data')
                                all_businesses.extend(high_value_businesses[i:])
                                break

                            try:
                                Actor.log.info(f'📞 Extracting phone/website for high-value business: {business["name"]} ({business.get("rating", 0)}⭐)')

                                phone = await scraper.extract_phone_from_business_page(page, business['maps_url'])
                                if phone:
                                    business['phone'] = phone
                                    Actor.log.info(f'✅ Found phone: {phone}')

                                if not business.get('website'):
                                    website = await scraper.extract_website_from_business_page(page)
                                    if website:
                                        business['website'] = website
                                        Actor.log.info(f'✅ Found website: {website}')

                                all_businesses.append(business)

                                # Return to the results list for the next one.
                                await page.go_back()
                                await page.wait_for_timeout(500)

                            except Exception as e:
                                Actor.log.warning(f'❌ Enhancement failed for {business["name"]}: {e}')
                                all_businesses.append(business)

                            # Bail out if a single enhancement is too slow.
                            elapsed = (datetime.now() - enhancement_start).total_seconds()
                            if elapsed > 15:
                                Actor.log.warning(f'⏱️ Enhancement taking too long ({elapsed:.1f}s), switching to basic mode')
                                all_businesses.extend(high_value_businesses[i+1:])
                                break
                        else:
                            # BUGFIX: when the loop completed normally, businesses
                            # beyond the enhancement budget were previously dropped
                            # from the results entirely. Keep them with basic data.
                            all_businesses.extend(high_value_businesses[enhancement_budget:])
                    else:
                        Actor.log.info('⚡ Insufficient time for enhancements, using basic data only')
                        all_businesses = basic_businesses
                else:
                    Actor.log.info('⚡ Enhanced extraction disabled or QA mode, using basic data only')
                    all_businesses = basic_businesses

                # --- Phase 3: optional email extraction ------------------
                if extract_emails and all_businesses:
                    time_left = search_timeout - (datetime.now() - search_start_time)
                    if time_left.total_seconds() > 15:
                        Actor.log.info('📧 Phase 3: Extracting emails from business websites...')

                        # Prioritize businesses that already have a phone —
                        # phone+email is the highest-value combination.
                        businesses_with_websites = [b for b in all_businesses if b.get('website')]
                        businesses_with_phone = [b for b in businesses_with_websites if b.get('phone')]
                        businesses_without_phone = [b for b in businesses_with_websites if not b.get('phone')]

                        email_candidates = businesses_with_phone + businesses_without_phone

                        Actor.log.info(f'📧 Email extraction plan: {len(businesses_with_phone)} businesses with phones, {len(businesses_without_phone)} without phones')

                        # Budget at most 5 sites; split remaining time evenly,
                        # clamped to a 2-4s per-site timeout.
                        email_budget = min(5, len(email_candidates))
                        if email_budget > 0:
                            time_per_email = min(4000, max(2000, time_left.total_seconds() * 1000 / email_budget))
                        else:
                            Actor.log.info('⚠️ No websites found to extract emails from')
                            time_per_email = 2000

                        for i, business in enumerate(email_candidates[:email_budget]):
                            # Stop at 95% of the search budget.
                            if (datetime.now() - search_start_time).total_seconds() > timeout_per_search * 0.95:
                                Actor.log.warning('⏱️ Email extraction time limit reached')
                                break

                            try:
                                priority_marker = "🔥" if business.get('phone') else "📧"
                                Actor.log.info(f'{priority_marker} Extracting email for: {business["name"]}')

                                email = await scraper.extract_email_from_website(business['website'], page, int(time_per_email))
                                if email:
                                    business['email'] = email
                                    Actor.log.info(f'✅ Found email: {email}')
                                else:
                                    business['email'] = ''

                                # Small randomized pause between sites.
                                if i < email_budget - 1:
                                    await page.wait_for_timeout(random.randint(500, 1000))

                            except Exception as e:
                                Actor.log.debug(f'Email extraction failed for {business["name"]}: {e}')
                                business['email'] = ''

                        # Ensure every record has an 'email' key for the dataset.
                        for business in all_businesses:
                            if not business.get('email'):
                                business['email'] = ''

                    else:
                        Actor.log.info('⏱️ Insufficient time for email extraction, skipping')
                        for business in all_businesses:
                            business['email'] = ''

                # --- Result assembly -------------------------------------
                result = {
                    'search_query': search_info.get('query', ''),
                    'location': search_info.get('location', ''),
                    'search_term': search_term,
                    'businesses': all_businesses[:max_businesses_to_extract],
                    'total_results': len(all_businesses[:max_businesses_to_extract]),
                    'timestamp': datetime.now().isoformat(),
                    'scraping_info': {
                        'max_requested': max_results_for_search,
                        'extracted': len(all_businesses[:max_businesses_to_extract]),
                        'scroll_attempts': scroll_attempts,
                        'extraction_method': 'hybrid_smart_extraction',
                        'businesses_with_phones': len([b for b in all_businesses if b.get('phone')]),
                        'businesses_with_emails': len([b for b in all_businesses if b.get('email')]),
                        'businesses_with_websites': len([b for b in all_businesses if b.get('website')]),
                        'high_value_leads': len([b for b in all_businesses if b.get('phone') and b.get('email')])
                    }
                }

                await context.push_data(result)
                scraper.results.append(result)

                # Per-search quality summary.
                phones_count = len([b for b in result["businesses"] if b.get('phone')])
                emails_count = len([b for b in result["businesses"] if b.get('email')])
                websites_count = len([b for b in result["businesses"] if b.get('website')])
                premium_leads = len([b for b in result["businesses"] if b.get('phone') and b.get('email')])

                Actor.log.info(f'✅ Extracted {len(result["businesses"])} businesses for "{search_term}"')
                Actor.log.info(f'💎 Lead Quality: {phones_count} phones | {emails_count} emails | {websites_count} websites | {premium_leads} premium leads (phone+email)')

                if premium_leads > 0:
                    Actor.log.info(f'🔥 HIGH VALUE: {premium_leads} businesses have both phone AND email - these are your money-making leads!')

            except Exception as e:
                Actor.log.exception(f'❌ Error processing {request.url}: {e}')

        # --- Build start requests (one per search) -----------------------
        start_urls = []
        for search_item in searches:
            try:
                # Accept both plain-string and {'query', 'location'} items.
                if isinstance(search_item, str):
                    query = search_item
                    location = ''
                else:
                    query = search_item.get('query', '')
                    location = search_item.get('location', '')

                search_term = f"{query} {location}".strip()
                maps_url = f'https://www.google.com/maps/search/{quote(search_term)}'

                request = Request.from_url(
                    maps_url,
                    user_data={
                        'search_info': json.dumps({
                            'query': query,
                            'location': location,
                            'term': search_term,
                            'max_results': search_item.get('max_results', max_results) if isinstance(search_item, dict) else max_results
                        })
                    }
                )
                start_urls.append(request)

            except Exception as e:
                Actor.log.error(f'❌ Error preparing search "{search_item}": {e}')

        Actor.log.info('🚀 Starting crawler...')
        await crawler.run(start_urls)

        # Flatten per-search results into one list of business records.
        all_extracted_businesses = []
        for result in scraper.results:
            all_extracted_businesses.extend(result.get('businesses', []))

        enriched_businesses = all_extracted_businesses

        # --- Optional premium enrichment ---------------------------------
        if (verify_emails or find_social_profiles or score_leads) and all_extracted_businesses:
            Actor.log.info(f'💎 Starting Premium Enrichment for {len(all_extracted_businesses)} leads...')

            enriched_businesses, _ = await enrich_leads_batch(
                all_extracted_businesses,
                verify_emails=verify_emails,
                find_social=find_social_profiles,
                score_leads=score_leads,
                max_concurrent=3
            )

            Actor.log.info(f'✅ Enrichment complete: {len(enriched_businesses)} leads processed')

            verified_count = len([b for b in enriched_businesses if b.get('email_verification', {}).get('is_deliverable')])
            social_count = len([b for b in enriched_businesses if b.get('social_profiles')])
            scored_count = len([b for b in enriched_businesses if b.get('lead_score')])
            premium_count = len([b for b in enriched_businesses if b.get('is_premium_lead')])

            Actor.log.info(f'📊 Enrichment Stats: {verified_count} verified emails | {social_count} with social | {scored_count} scored | {premium_count} premium leads')

        # --- Pay-per-event charging --------------------------------------
        Actor.log.info('💰 Processing charges...')

        # A lead counts as "enriched" if it carries any premium signal.
        enriched_count = len([b for b in enriched_businesses
                              if b.get('email') or b.get('social_profiles') or b.get('lead_score')])
        basic_count = len(enriched_businesses) - enriched_count

        if basic_count > 0:
            await Actor.charge(event_name='basic_lead', count=basic_count)
            Actor.log.info(f"  💵 basic_lead: {basic_count} × $0.01 = ${basic_count * 0.01:.3f}")

        if enriched_count > 0:
            await Actor.charge(event_name='enriched_lead', count=enriched_count)
            Actor.log.info(f"  💵 enriched_lead: {enriched_count} × $0.03 = ${enriched_count * 0.03:.3f}")

        total_revenue = (basic_count * 0.01) + (enriched_count * 0.03)
        Actor.log.info(f'💰 Total charges: ${total_revenue:.3f}')

        # Push one dataset item per business.
        for business in enriched_businesses:
            await Actor.push_data(business)

        # --- Final run summary -------------------------------------------
        total_businesses = len(enriched_businesses)
        Actor.log.info('=' * 60)
        Actor.log.info(f'{const.EMOJI_INFO} SCRAPING COMPLETED')
        Actor.log.info(f'{const.EMOJI_SUCCESS} Searches completed: {len(scraper.results)}/{len(searches)}')
        Actor.log.info(f'{const.EMOJI_EXTRACTION} Total businesses extracted: {total_businesses}')

        if scraper.stats['email_attempts'] > 0:
            email_success_rate = (scraper.stats['email_successes'] / scraper.stats['email_attempts']) * 100
            Actor.log.info(f'{const.EMOJI_EMAIL} Email extraction: {scraper.stats["email_successes"]}/{scraper.stats["email_attempts"]} ({email_success_rate:.1f}% success rate)')

        if scraper.stats['phone_attempts'] > 0:
            phone_success_rate = (scraper.stats['phone_successes'] / scraper.stats['phone_attempts']) * 100
            Actor.log.info(f'{const.EMOJI_PHONE} Phone extraction: {scraper.stats["phone_successes"]}/{scraper.stats["phone_attempts"]} ({phone_success_rate:.1f}% success rate)')

        if scraper.stats['website_attempts'] > 0:
            website_success_rate = (scraper.stats['website_successes'] / scraper.stats['website_attempts']) * 100
            Actor.log.info(f'{const.EMOJI_WEBSITE} Website extraction: {scraper.stats["website_successes"]}/{scraper.stats["website_attempts"]} ({website_success_rate:.1f}% success rate)')

        Actor.log.info('=' * 60)
936
937
if __name__ == "__main__":
    # Script entry point: run the async actor main loop to completion.
    asyncio.run(main())