Google Maps Competitor Analyzer

Under maintenance

Pricing: $29.00 / 1,000 results

Developed by Brian Pyatt · Maintained by Community

Find competitors near any business location using Google Maps. Enter addresses, Place IDs, or coordinates to discover nearby businesses with ratings, reviews, and rankings. Perfect for market research, site selection, and competitive analysis. Supports bulk locations and custom search terms.

Rating: 0.0 (0 reviews)

Total users: 5
Monthly users: 5
Runs succeeded: 0%
Last modified: 16 hours ago

.dockerignore

.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Visual Studio Code
# Ignores the folder created by VS Code when changing workspace settings, doing debugger
# configuration, etc. Can be commented out to share Workspace Settings within a team
.vscode
# Zed editor
# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
# to share Project Settings within a team
.zed

.gitignore

.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Visual Studio Code
# Ignores the folder created by VS Code when changing workspace settings, doing debugger
# configuration, etc. Can be commented out to share Workspace Settings within a team
.vscode
# Zed editor
# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
# to share Project Settings within a team
.zed
# Added by Apify CLI
node_modules

Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-playwright:3.13
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Install Playwright and its dependencies
RUN playwright install-deps && \
playwright install
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q src/
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

requirements.txt

# The Apify SDK is imported by src/main.py ("from apify import Actor") and is not
# provided by apify-client, so it is listed explicitly to make local installs work.
apify
apify-client
pandas
requests
playwright

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "google-maps-competitor-analyzer",
  "title": "Google Maps Competitor Analyzer",
  "description": "Find competitors near any business location using Google Maps. Enter addresses, Place IDs, or coordinates to discover nearby businesses with ratings, reviews, and rankings. Perfect for market research, site selection, and competitive analysis. Supports bulk locations and custom search terms.",
  "version": "1.2",
  "buildTag": "latest",
  "meta": {
    "templateId": "python-playwright"
  },
  "input": "./input_schema.json",
  "dockerfile": "../Dockerfile"
}

.actor/input_schema.json

{
  "title": "Google Maps Competitor Analyzer",
  "description": "Find competitors near any business location using Google Maps. Enter addresses, Place IDs, or coordinates to discover nearby businesses with ratings, reviews, and rankings. Perfect for market research, site selection, and competitive analysis. Supports bulk locations and custom search terms.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "searchQuery": {
      "title": "Search Niche/Keyword (e.g., Medical Spas, Mexican Restaurants, Coffee Shops)",
      "type": "string",
      "description": "What type of businesses to search for. Be specific for best results.",
      "default": "medical spa",
      "editor": "textfield",
      "prefill": "medical spa"
    },
    "locations": {
      "title": "Business Locations - Left Side Required (Address, Place ID, or Coordinates), Right Side Optional (Business Name)",
      "type": "array",
      "description": "LEFT FIELD (REQUIRED): Enter ONE of the following: 1) Full street address (123 Main St, Beverly Hills, CA 90210), 2) Google Place ID (ChIJ2eUgeAK6j4ARbn5u_wAGqWA), or 3) GPS coordinates without spaces (34.0522,-118.2437). RIGHT FIELD (OPTIONAL): Business name to identify your location in results.",
      "editor": "keyValue",
      "prefill": [
        {
          "key": "123 Main St, Beverly Hills, CA 90210",
          "value": "My Med Spa"
        },
        {
          "key": "ChIJ2eUgeAK6j4ARbn5u_wAGqWA",
          "value": "Competitor Spa"
        },
        {
          "key": "34.0522,-118.2437",
          "value": ""
        }
      ]
    },
    "maxCompetitors": {
      "title": "Maximum Competitors per Location",
      "type": "integer",
      "description": "How many competitors to return per location. Google Maps typically shows 20-30 businesses in an area.",
      "default": 30,
      "minimum": 1,
      "maximum": 30,
      "editor": "number"
    },
    "proxyConfiguration": {
      "title": "Proxy Configuration",
      "type": "object",
      "description": "Use residential proxies to avoid blocks. Recommended for multiple locations.",
      "editor": "proxy",
      "prefill": {
        "useApifyProxy": true
      }
    }
  },
  "required": ["searchQuery", "locations"]
}
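
The schema above maps one-to-one onto a run input. As a hedged sketch, a run could be started programmatically with the apify-client package (already in requirements.txt); the API token and the Actor handle below are placeholders/assumptions, not values taken from this repo:

```python
from apify_client import ApifyClient

# Placeholder token; the Actor handle is assumed from the "name" field in .actor/actor.json.
client = ApifyClient("<APIFY_API_TOKEN>")

run_input = {
    "searchQuery": "medical spa",
    "locations": [
        {"key": "123 Main St, Beverly Hills, CA 90210", "value": "My Med Spa"},
        {"key": "ChIJ2eUgeAK6j4ARbn5u_wAGqWA", "value": "Competitor Spa"},
        {"key": "34.0522,-118.2437", "value": ""},
    ],
    "maxCompetitors": 30,
    "proxyConfiguration": {"useApifyProxy": True},
}

# Start the Actor, wait for it to finish, then read the flattened competitor rows.
run = client.actor("<username>/google-maps-competitor-analyzer").call(run_input=run_input)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["competitor_rank"], item["competitor_name"], item["competitor_rating"])
```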

.actor/README.MD

# Google Maps Competitor Analyzer
Find and analyze competitors for any business location using Google Maps data. Simply provide an address, Google Place ID, or coordinates, and this actor will discover nearby competing businesses with detailed information.
## 🎯 Use Cases
- **Market Research**: Analyze competition before opening a new location
- **Site Selection**: Compare competitor density across multiple potential locations
- **Lead Generation**: Find businesses in specific niches for B2B outreach
- **Competitive Analysis**: Track ratings and reviews of competitors
- **Location Scouting**: Identify market gaps and opportunities
## 📊 What You Get
For each competitor found:
- Business name and Google rank
- Average rating and review count
- Business category
- Full address
- Google Place ID and CID
- Identification of whether the listing is your own business
## 🔧 Input
### Search Query
Enter the type of businesses to search for:
- `medical spa`
- `mexican restaurant`
- `coffee shop`
- `dental office`
- etc.
### Locations
Enter locations in any of these formats:
**Full Address:** `123 Main St, Beverly Hills, CA 90210`

**Google Place ID:** `ChIJ2eUgeAK6j4ARbn5u_wAGqWA`

**GPS Coordinates:** `34.0522,-118.2437`
### Business Name (Optional)
Add your business name to identify it in the results. The actor will mark it with `is_source_business: true` if found.
## 📤 Output
Results are formatted as one row per competitor, perfect for CSV export:
```json
{
  "input_location": "123 Main St, Beverly Hills, CA",
  "input_business_name": "My Med Spa",
  "search_query": "medical spa",
  "latitude": 34.0522,
  "longitude": -118.2437,
  "competitor_name": "Luxury Spa",
  "competitor_rank": 1,
  "competitor_rating": 4.8,
  "competitor_reviews": 245,
  "competitor_category": "Medical spa",
  "competitor_address": "456 Oak St",
  "is_source_business": false
}
```
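
Because each dataset item is already one flat row, export needs almost no post-processing. A minimal sketch, assuming a finished run's dataset ID and using the pandas and apify-client packages from requirements.txt (the token, dataset ID, and output file name are placeholders):

```python
import pandas as pd
from apify_client import ApifyClient

client = ApifyClient("<APIFY_API_TOKEN>")  # placeholder token

# Pull every flattened competitor row from the run's default dataset.
items = list(client.dataset("<DATASET_ID>").iterate_items())

# One row per competitor, so a plain DataFrame is enough - no json_normalize needed.
df = pd.DataFrame(items)
df.to_csv("competitors.csv", index=False)  # hypothetical output file name
print(df[["competitor_name", "competitor_rank", "competitor_rating"]].head())
```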
## ⚡ Features
- **Bulk Processing**: Analyze multiple locations in one run
- **Smart Detection**: Automatically identifies address vs Place ID vs coordinates
- **CSV Ready**: Flattened output structure for easy Excel/CSV export
- **Source Identification**: Marks your business if found in results
- **Flexible Search**: Customize search terms for any business type
## 🚀 Example Run
**Input:**
- Search: "coffee shops"
- Locations:
  - `350 5th Ave, New York, NY` | Starbucks
  - `ChIJ2eUgeAK6j4ARbn5u_wAGqWA` |
  - `40.7128,-74.0060` | My Coffee Shop

**Output:** Up to 30 competitors per location with full details
## ⚠️ Limitations
- Maximum ~30 competitors per location (Google Maps limitation)
- Requires stable internet connection
- Best results with specific search terms
- Coordinates must be in decimal format without spaces
## 💡 Tips
- Use specific search terms for better results
- Include business names to track your locations
- Try different search terms to find all competitors
- Export to CSV for easy analysis in Excel
## 🐛 Support
Found an issue or have a suggestion? Please create an issue on the actor page.

.claude/settings.local.json

{
  "permissions": {
    "allow": [
      "Bash(grep:*)",
      "Bash(echo:*)",
      "Bash(python:*)",
      "Bash(cp:*)"
    ],
    "deny": []
  }
}

src/main.py

import asyncio
import re
from urllib.parse import quote_plus
from apify import Actor
from playwright.async_api import async_playwright, TimeoutError

# ======================================================================
# HELPER FUNCTIONS v1.2
# ======================================================================
def is_valid_business_name(name):
    if not name or len(name.strip()) < 1: return False
    if name.strip().startswith('"') and name.strip().endswith('"'): return False
    name_lower = name.strip().lower()
    if len(name_lower) > 80: return False
    # Don't reject single digits - they might be actual business names
    # But reject common UI elements
    if name_lower in ['x', '+', '-', '·', '|', '/', '\\', '^', '*']: return False
    if name_lower.startswith(('·', '$', '#', '@', 'http', 'sponsored')): return False
    # Skip if it looks like an address (starts with number and has street suffix)
    if re.match(r'^\d+\s+\w+\s+(st|street|ave|rd|dr|blvd)', name_lower, re.IGNORECASE): return False
    skip_patterns = ['google maps', 'search results', 'filter by', 'sort by', 'reviews', 'directions', 'open now', 'closed']
    if any(pattern in name_lower for pattern in skip_patterns): return False
    return True

def clean_business_name(name):
    if not name: return ""
    cleaned = name.strip().rstrip('&|\\').strip()
    cleanup_patterns = [
        r'\s*·.*$', r'^\s*\d+\.\s*', r'\s*\(\d+\)\s*$', r'\s*\d+\s*reviews?$',
        r'\s*\$+.*$', r'\s*Open\s*(24\s*hours?|now).*$', r'\s*Closed.*$', r'\s*Medical spa.*$'
    ]
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()

def extract_rating(lines):
    for line in lines:
        match = re.search(r'\b(\d\.\d)\b', line)
        if match:
            try: return float(match.group(1))
            except: continue
    return None

def extract_review_count(lines):
    for line in lines:
        match = re.search(r'\((\d{1,5})\)', line)
        if match:
            try: return int(match.group(1))
            except: continue
    return None

def extract_address(lines):
    # Try multiple patterns to find address
    address_parts = []
    found_address_start = False

    for i, line in enumerate(lines):
        # Skip very short lines (likely not addresses)
        if len(line) < 5:
            continue

        # Skip common non-address elements
        if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now', 'temporarily closed']:
            continue

        # Check if this line contains rating or review count (not part of address)
        if re.match(r'^[\d.]+$', line) or re.match(r'^\(\d+\)$', line):
            continue

        # Look for common address patterns
        has_street_pattern = re.search(r'\d+.*?(?:st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|suite|ste|unit|apt|#)', line, re.IGNORECASE)
        has_state_zip = re.search(r'\b[A-Z]{2}\s+\d{5}\b', line)
        has_po_box = re.search(r'(p\.?o\.?\s*box|pmb)\s*\d+', line, re.IGNORECASE)

        if has_street_pattern or has_state_zip or has_po_box:
            # Found start of address
            found_address_start = True
            # Clean up the line
            cleaned_line = line.split('·')[-1].strip()
            # Remove common prefixes
            cleaned_line = re.sub(r'^(Open|Closed|Hours|Directions).*?·\s*', '', cleaned_line, flags=re.IGNORECASE)
            address_parts.append(cleaned_line)
        elif found_address_start and i < len(lines) - 1:
            # This might be a continuation of the address (like city, state, zip on next line)
            # Check if it's not a category or other non-address info
            if not re.search(r'(medical spa|wellness center|aesthetic|dermatology|clinic|reviews?|rating|open|closed|hours|directions|website|call)', line, re.IGNORECASE):
                # Check if it might be city/state/zip
                if re.search(r'\b[A-Z]{2}\b', line) or re.search(r'\b\d{5}\b', line) or len(line) < 50:
                    address_parts.append(line.strip())
            else:
                # Probably not part of address anymore
                break

    # Join address parts
    if address_parts:
        # Join with comma if parts don't already have commas
        full_address = address_parts[0]
        for part in address_parts[1:]:
            if not full_address.endswith(',') and not part.startswith(','):
                full_address += ', ' + part
            else:
                full_address += ' ' + part
        return full_address.strip()

    return ""

def determine_category(lines):
    text = ' '.join(lines).lower()
    if 'dermatology' in text: return "Dermatology clinic"
    if any(x in text for x in ['wellness', 'wellness center']): return "Wellness center"
    if any(x in text for x in ['aesthetic', 'aesthetics']): return "Aesthetic clinic"
    return "Medical spa"

def extract_place_ids(html_content: str) -> dict:
    ids = {'place_id': None, 'cid': None}
    place_id_match = re.search(r'(ChIJ[A-Za-z0-9_-]+)', html_content)
    if place_id_match:
        ids['place_id'] = place_id_match.group(1)
    cid_match = re.search(r'(0x[a-f0-9:]+)', html_content)
    if cid_match:
        ids['cid'] = cid_match.group(1)
    return ids

def parse_address_components(address):
    """Extract city, state, and zipcode from address string"""
    if not address:
        return {'city': '', 'state': '', 'zipcode': ''}

    components = {'city': '', 'state': '', 'zipcode': ''}
    original_address = address

    # Common Virginia cities for this data
    known_va_cities = ['Ashburn', 'Sterling', 'Leesburg', 'Herndon', 'Reston', 'Chantilly', 'Fairfax']

    # First, try to extract ZIP code (5 digits, optionally followed by -4 digits)
    # Look for standalone 5-digit numbers that aren't at the beginning of the address
    zip_match = re.search(r'\b(?<!\d)(\d{5})(?:-\d{4})?\b(?!\s*[A-Za-z]+\s+(?:St|Street|Ave|Avenue|Rd|Road|Dr|Drive|Blvd|Boulevard|Way|Lane|Ln|Pkwy|Parkway|Ct|Court|Pl|Place))', address)
    if zip_match:
        # Check if this 5-digit number is actually a street number by looking at what follows
        after_match = address[zip_match.end():].strip()
        # If it's followed by a street name, it's not a zip
        if not re.match(r'^[A-Za-z]+\s+(?:St|Street|Ave|Avenue|Rd|Road|Dr|Drive|Blvd|Boulevard|Way|Lane|Ln|Pkwy|Parkway|Ct|Court|Pl|Place)', after_match, re.IGNORECASE):
            components['zipcode'] = zip_match.group(1)
            address = address.replace(zip_match.group(0), '').strip().rstrip(',')

    # Try to extract state (2 letter abbreviation)
    state_pattern = r'\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC)\b'

    # Look for state abbreviation, preferring ones near the end
    state_matches = list(re.finditer(state_pattern, address, re.IGNORECASE))
    if state_matches:
        # Use the last state match (most likely to be the actual state)
        state_match = state_matches[-1]
        components['state'] = state_match.group(1).upper()
        # Remove state and clean up
        address = address[:state_match.start()] + address[state_match.end():]
        address = address.strip().rstrip(',').strip()

    # For Ashburn area addresses, if no state found but we have Ashburn, assume VA
    if not components['state'] and any(city in original_address for city in known_va_cities):
        components['state'] = 'VA'

    # Extract city - look for known cities first
    for known_city in known_va_cities:
        if known_city.lower() in original_address.lower():
            components['city'] = known_city
            break

    # If no known city found, try to extract from address structure
    if not components['city']:
        # Remove street address parts to find city
        # Split by comma and analyze parts
        parts = address.split(',')
        for part in reversed(parts):  # Start from the end
            part = part.strip()

            # Skip if it's a street address (starts with number)
            if re.match(r'^\d+\s+', part):
                continue

            # Skip if it contains suite/unit/building info
            if re.search(r'\b(suite|ste|unit|apt|building|bldg|floor|fl|#)\s*[\w\d-]*\b', part, re.IGNORECASE):
                continue

            # Skip if it ends with street type
            if re.search(r'\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|plaza|plz|cir|circle|ter|terrace|commons|cmns)\s*(?:#?\d+)?$', part, re.IGNORECASE):
                continue

            # This might be the city
            if part and not re.match(r'^\d', part):
                components['city'] = part
                break

    # Special handling for specific cases
    if components['state'] == 'FL' and not components['city']:
        if 'jacksonville' in original_address.lower():
            components['city'] = 'Jacksonville'

    if components['state'] == 'CA' and not components['city']:
        ca_cities = ['Los Angeles', 'Santa Monica', 'Beverly Hills', 'Pasadena', 'Whittier',
                     'Encino', 'Burbank', 'Glendale', 'Hollywood', 'Sherman Oaks']
        for city in ca_cities:
            if city.lower() in original_address.lower():
                components['city'] = city
                break

    return components

def parse_location_type(location_str):
    """Determine if the location is an address, Place ID, or coordinates"""
    location_str = location_str.strip()

    # Check if it's a Place ID (starts with ChIJ)
    if location_str.startswith('ChIJ'):
        return 'place_id', location_str

    # Check if it's coordinates (format: lat,lng) - now more forgiving
    # Remove spaces around comma for better matching
    normalized = location_str.replace(' ', '')
    coord_pattern = r'^-?\d+\.?\d*,-?\d+\.?\d*$'
    if re.match(coord_pattern, normalized):
        parts = normalized.split(',')
        try:
            lat = float(parts[0])
            lng = float(parts[1])
            return 'coordinates', (lat, lng)
        except:
            pass

    # Otherwise, treat as address
    return 'address', location_str

# ======================================================================
# ACTOR SCRAPING LOGIC
# ======================================================================
async def geocode_with_address(page, address: str) -> dict:
    Actor.log.info(f"Geocoding address: {address}...")
    query = quote_plus(address)
    maps_url = f"https://www.google.com/maps/search/{query}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match:
        return {'found': False, 'error': 'Could not find coordinates from address'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, **ids}

async def geocode_with_place_id(page, place_id: str) -> dict:
    Actor.log.info(f"Geocoding Place ID: {place_id}...")
    maps_url = f"https://www.google.com/maps/search/?api=1&query=some_text&query_place_id={place_id}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match: return {'found': False, 'error': 'Could not find coordinates from Place ID'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, 'place_id': place_id, **ids}

async def scrape_competitors_at_coordinates(page, source_company_name: str, lat: float, lon: float,
                                            search_query: str, max_competitors: int):
    Actor.log.info(f"Scraping competitors at {lat},{lon} for '{search_query}'...")
    query = quote_plus(search_query)
    zoom_level = 11  # Wider area view to show more businesses
    url = f"https://www.google.com/maps/search/{query}/@{lat},{lon},{zoom_level}z"
    Actor.log.info(f"Using zoom level {zoom_level} for wider area coverage")
    await page.goto(url, wait_until='domcontentloaded', timeout=45000)

    # Always use 25% zoom for maximum results
    await page.evaluate('document.body.style.zoom="0.25"')
    await asyncio.sleep(3)

    selectors = ['[role="article"]', '.Nv2PK.THOPZb.CpccDe', '.lI9IFe', '.hfpxzc']
    elems = []
    for sel in selectors:
        elems = await page.query_selector_all(sel)
        if elems and len(elems) > 5:
            Actor.log.info(f"Found {len(elems)} elements using selector: {sel}")
            break

    competitors = []
    cleaned_source_name = re.sub(r'[^a-z0-9]', '', source_company_name.lower()) if source_company_name else ""

    for element in elems[:max_competitors]:
        element_html = await element.inner_html()
        ids = extract_place_ids(element_html)
        full_text = await element.inner_text()
        lines = [ln.strip() for ln in full_text.split('\n') if ln.strip()]

        # Look for the actual business name, skipping numbers and UI elements
        business_name = None
        for i, line in enumerate(lines):
            # Skip single digits (likely list numbers)
            if line.strip().isdigit() and len(line.strip()) <= 2:
                continue
            # Skip common UI elements
            if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now']:
                continue
            # Try to validate and clean the line as a business name
            if is_valid_business_name(line):
                business_name = clean_business_name(line)
                if business_name:  # Make sure cleaning didn't result in empty string
                    break

        if not business_name or any(c['business_name'] == business_name for c in competitors):
            continue

        is_source = False
        if cleaned_source_name:
            cleaned_competitor_name = re.sub(r'[^a-z0-9]', '', business_name.lower())
            is_source = cleaned_source_name in cleaned_competitor_name or cleaned_competitor_name in cleaned_source_name

        # Extract address and parse components
        address = extract_address(lines)
        address_components = parse_address_components(address)

        competitor = {
            'place_id': ids.get('place_id'),
            'business_name': business_name,
            'google_rank': len(competitors) + 1,
            'is_source_business': is_source,
            'average_rating': extract_rating(lines),
            'number_of_reviews': extract_review_count(lines),
            'main_category': determine_category(lines),
            'address': address,
            'city': address_components['city'],
            'state': address_components['state'],
            'zipcode': address_components['zipcode']
        }
        competitors.append(competitor)

    return competitors

# ======================================================================
# MAIN ACTOR FUNCTION
# ======================================================================
async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Parse input from new schema
        locations_data = actor_input.get('locations', [])
        search_query = actor_input.get('searchQuery', 'medical spa')
        max_competitors = actor_input.get('maxCompetitors', 30)
        proxy_configuration = actor_input.get('proxyConfiguration')

        # Log input for debugging
        Actor.log.info(f"Received input: {len(locations_data)} locations")
        Actor.log.info(f"Search query: {search_query}")
        Actor.log.info(f"Max competitors per location: {max_competitors}")

        # Convert key-value pairs to locations and business names
        locations = []
        business_names = []
        for item in locations_data:
            if isinstance(item, dict):
                locations.append(item.get('key', ''))
                business_names.append(item.get('value', ''))
            else:
                # Fallback for string input
                locations.append(str(item))
                business_names.append('')

        if not locations:
            Actor.log.error("No locations provided. Please add at least one location.")
            await Actor.push_data({
                'error': 'No locations provided',
                'message': 'Please provide at least one location (address, Place ID, or coordinates)'
            })
            return

        Actor.log.info(f"Processing {len(locations)} locations...")
        Actor.log.info(f"Search query: '{search_query}'")
        Actor.log.info(f"Max competitors: {max_competitors}")

        # Track total results pushed
        total_results = 0

        # Launch browser
        async with async_playwright() as p:
            # Handle proxy if configured
            browser_options = {"headless": True}

            if proxy_configuration and proxy_configuration.get('useApifyProxy'):
                try:
                    # For Apify proxy, you might need to check the correct method
                    # This is a placeholder - check Apify docs for your SDK version
                    Actor.log.info("Proxy requested but implementation needs to be verified for your Apify SDK version")
                except Exception as e:
                    Actor.log.warning(f"Could not set up proxy: {e}")

            Actor.log.info("Launching browser...")
            browser = await p.chromium.launch(**browser_options)
            page = await browser.new_page()

            # Process each location
            for idx, location_str in enumerate(locations):
                # Get business name from the paired value
                business_name = business_names[idx] if idx < len(business_names) else None

                # Skip empty locations
                if not location_str or not location_str.strip():
                    continue

                # Parse location type
                location_type, location_data = parse_location_type(location_str)

                Actor.log.info(f"\nProcessing location {idx + 1}/{len(locations)}: {location_str}")
                if business_name:
                    Actor.log.info(f"Business name: {business_name}")

                geo = {'found': False}

                try:
                    if location_type == 'coordinates':
                        lat, lng = location_data
                        Actor.log.info(f"Using provided coordinates: {lat}, {lng}")
                        geo = {'found': True, 'latitude': lat, 'longitude': lng}
                    elif location_type == 'place_id':
                        geo = await geocode_with_place_id(page, location_data)
                    else:  # address
                        geo = await geocode_with_address(page, location_data)
                except Exception as e:
                    Actor.log.exception(f"Geocoding failed: {e}")
                    geo = {'found': False, 'error': str(e)}

                # Scrape competitors if geocoding succeeded
                competitors_list = []
                if geo.get('found'):
                    try:
                        competitors_list = await scrape_competitors_at_coordinates(
                            page,
                            business_name,
                            geo['latitude'],
                            geo['longitude'],
                            search_query,
                            max_competitors
                        )
                        Actor.log.info(f"Found {len(competitors_list)} competitors for {business_name or location_str}")
                    except Exception as e:
                        Actor.log.exception(f"Competitor scraping failed: {e}")
                else:
                    Actor.log.warning(f"Skipping competitor search - geocoding failed: {geo.get('error', 'Unknown error')}")

                # Push results - FLATTENED for CSV
                if competitors_list:
                    # One row per competitor
                    for competitor in competitors_list:
                        result = {
                            # Competitor details in requested order
                            'competitor_name': competitor['business_name'],
                            'competitor_address': competitor['address'] or '',
                            'competitor_city': competitor['city'] or '',
                            'competitor_state': competitor['state'] or '',
                            'competitor_zipcode': competitor['zipcode'] or '',
                            'competitor_category': competitor['main_category'],
                            'competitor_rank': competitor['google_rank'],
                            'competitor_rating': competitor['average_rating'],
                            'competitor_reviews': competitor['number_of_reviews'],
                            'competitor_place_id': competitor['place_id'] or '',
                            'latitude': geo.get('latitude'),
                            'longitude': geo.get('longitude'),
                            'search_query': search_query,
                            # Source business info at the end
                            'is_source_business': competitor['is_source_business'],
                            'input_location': location_str,
                            'input_business_name': business_name or ''
                        }
                        await Actor.push_data(result)
                        total_results += 1
                else:
                    # No competitors found - still push location info
                    result = {
                        # Empty competitor details in requested order
                        'competitor_name': '',
                        'competitor_address': '',
                        'competitor_city': '',
                        'competitor_state': '',
                        'competitor_zipcode': '',
                        'competitor_category': '',
                        'competitor_rank': None,
                        'competitor_rating': None,
                        'competitor_reviews': None,
                        'competitor_place_id': '',
                        'latitude': geo.get('latitude'),
                        'longitude': geo.get('longitude'),
                        'search_query': search_query,
                        # Source business info at the end
                        'is_source_business': None,
                        'input_location': location_str,
                        'input_business_name': business_name or ''
                    }
                    await Actor.push_data(result)
                    total_results += 1

            await browser.close()
            Actor.log.info(f"\nScraping completed! Processed {len(locations)} locations and found {total_results} total results.")

# Entry point
if __name__ == "__main__":
    asyncio.run(main())
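
Since `parse_location_type` and `parse_address_components` are pure functions, they can be sanity-checked without launching a browser. A small illustrative check (expected values are inferred from the code paths above, not from a recorded run; the `src.main` import path assumes you run from the repo root):

```python
# Illustrative check of the pure helper functions from src/main.py.
from src.main import parse_location_type, parse_address_components

# Place IDs are recognized by their ChIJ prefix.
assert parse_location_type("ChIJ2eUgeAK6j4ARbn5u_wAGqWA")[0] == "place_id"

# Coordinates tolerate stray spaces around the comma.
kind, (lat, lng) = parse_location_type("34.0522, -118.2437")
assert kind == "coordinates" and lat == 34.0522

# Anything else falls through to 'address'.
assert parse_location_type("123 Main St, Beverly Hills, CA 90210")[0] == "address"

# Component parsing pulls state and ZIP from the tail of the string.
components = parse_address_components("456 Oak St, Beverly Hills, CA 90210")
assert components["state"] == "CA" and components["zipcode"] == "90210"
```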

src/main_backup.py

import asyncio
import re
from urllib.parse import quote_plus
from apify import Actor
from playwright.async_api import async_playwright, TimeoutError

# ======================================================================
# HELPER FUNCTIONS v1.1
# ======================================================================
def is_valid_business_name(name):
    if not name or len(name.strip()) < 1: return False
    if name.strip().startswith('"') and name.strip().endswith('"'): return False
    name_lower = name.strip().lower()
    if len(name_lower) > 80: return False
    # Don't reject single digits - they might be actual business names
    # But reject common UI elements
    if name_lower in ['x', '+', '-', '·', '|', '/', '\\', '^', '*']: return False
    if name_lower.startswith(('·', '$', '#', '@', 'http', 'sponsored')): return False
    # Skip if it looks like an address (starts with number and has street suffix)
    if re.match(r'^\d+\s+\w+\s+(st|street|ave|rd|dr|blvd)', name_lower, re.IGNORECASE): return False
    skip_patterns = ['google maps', 'search results', 'filter by', 'sort by', 'reviews', 'directions', 'open now', 'closed']
    if any(pattern in name_lower for pattern in skip_patterns): return False
    return True

def clean_business_name(name):
    if not name: return ""
    cleaned = name.strip().rstrip('&|\\').strip()
    cleanup_patterns = [
        r'\s*·.*$', r'^\s*\d+\.\s*', r'\s*\(\d+\)\s*$', r'\s*\d+\s*reviews?$',
        r'\s*\$+.*$', r'\s*Open\s*(24\s*hours?|now).*$', r'\s*Closed.*$', r'\s*Medical spa.*$'
    ]
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()

def extract_rating(lines):
    for line in lines:
        match = re.search(r'\b(\d\.\d)\b', line)
        if match:
            try: return float(match.group(1))
            except: continue
    return None

def extract_review_count(lines):
    for line in lines:
        match = re.search(r'\((\d{1,5})\)', line)
        if match:
            try: return int(match.group(1))
            except: continue
    return None

def extract_address(lines):
    # Try multiple patterns to find address
    address_parts = []
    found_address_start = False

    for i, line in enumerate(lines):
        # Skip very short lines (likely not addresses)
        if len(line) < 5:
            continue

        # Skip common non-address elements
        if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now', 'temporarily closed']:
            continue

        # Check if this line contains rating or review count (not part of address)
        if re.match(r'^[\d.]+$', line) or re.match(r'^\(\d+\)$', line):
            continue

        # Look for common address patterns
        has_street_pattern = re.search(r'\d+.*?(?:st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|suite|ste|unit|apt|#)', line, re.IGNORECASE)
        has_state_zip = re.search(r'\b[A-Z]{2}\s+\d{5}\b', line)
        has_po_box = re.search(r'(p\.?o\.?\s*box|pmb)\s*\d+', line, re.IGNORECASE)

        if has_street_pattern or has_state_zip or has_po_box:
            # Found start of address
            found_address_start = True
            # Clean up the line
            cleaned_line = line.split('·')[-1].strip()
            # Remove common prefixes
            cleaned_line = re.sub(r'^(Open|Closed|Hours|Directions).*?·\s*', '', cleaned_line, flags=re.IGNORECASE)
            address_parts.append(cleaned_line)
        elif found_address_start and i < len(lines) - 1:
            # This might be a continuation of the address (like city, state, zip on next line)
            # Check if it's not a category or other non-address info
            if not re.search(r'(medical spa|wellness center|aesthetic|dermatology|clinic|reviews?|rating|open|closed|hours|directions|website|call)', line, re.IGNORECASE):
                # Check if it might be city/state/zip
                if re.search(r'\b[A-Z]{2}\b', line) or re.search(r'\b\d{5}\b', line) or len(line) < 50:
                    address_parts.append(line.strip())
            else:
                # Probably not part of address anymore
                break

    # Join address parts
    if address_parts:
        # Join with comma if parts don't already have commas
        full_address = address_parts[0]
        for part in address_parts[1:]:
            if not full_address.endswith(',') and not part.startswith(','):
                full_address += ', ' + part
            else:
                full_address += ' ' + part
        return full_address.strip()

    return ""

def determine_category(lines):
    text = ' '.join(lines).lower()
    if 'dermatology' in text: return "Dermatology clinic"
    if any(x in text for x in ['wellness', 'wellness center']): return "Wellness center"
    if any(x in text for x in ['aesthetic', 'aesthetics']): return "Aesthetic clinic"
    return "Medical spa"

def extract_place_ids(html_content: str) -> dict:
    ids = {'place_id': None, 'cid': None}
    place_id_match = re.search(r'(ChIJ[A-Za-z0-9_-]+)', html_content)
    if place_id_match:
        ids['place_id'] = place_id_match.group(1)
    cid_match = re.search(r'(0x[a-f0-9:]+)', html_content)
    if cid_match:
        ids['cid'] = cid_match.group(1)
    return ids

def parse_address_components(address):
    """Extract city, state, and zipcode from address string"""
    if not address:
        return {'city': '', 'state': '', 'zipcode': ''}

    components = {'city': '', 'state': '', 'zipcode': ''}
    original_address = address

    # First, try to extract ZIP code (5 digits, optionally followed by -4 digits)
    # But make sure it's not at the beginning (like a street number)
    zip_patterns = [
        r',\s*(\d{5}(?:-\d{4})?)\s*$',  # ZIP at end after comma
        r'\s+(\d{5}(?:-\d{4})?)\s*$',  # ZIP at end after space
        r'\b(?<!^\d)(\d{5}(?:-\d{4})?)\b(?!.*\d{5})'  # Last 5-digit number that's not at start
    ]

    for pattern in zip_patterns:
        zip_match = re.search(pattern, address)
        if zip_match:
            components['zipcode'] = zip_match.group(1)
            address = address.replace(zip_match.group(0), '').strip().rstrip(',')
            break

    # Try to extract state (2 letter abbreviation)
    state_pattern = r'\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC)\b'

    # Look for state abbreviation, preferring ones near the end
    state_matches = list(re.finditer(state_pattern, address, re.IGNORECASE))
    if state_matches:
        # Use the last state match (most likely to be the actual state)
        state_match = state_matches[-1]
        components['state'] = state_match.group(1).upper()
        # Remove state and clean up
        address = address[:state_match.start()] + address[state_match.end():]
        address = address.strip().rstrip(',').strip()

    # Special handling for addresses that start with business name or complex location
    # e.g., "Stone Springs MedSpa by LMG, Physicians Office Building, 24430 Stone Springs Blvd Suite 100A"
    parts = address.split(',')

    # Remove parts that are clearly not city names
    filtered_parts = []
    for part in parts:
        part = part.strip()
        # Skip empty parts
        if not part:
            continue

        # Skip parts that are clearly street addresses
        if re.match(r'^\d+\s+\w+', part):
            continue

        # Skip suite/unit/building descriptions
        if re.search(r'\b(suite|ste|unit|apt|building|bldg|floor|fl|#)\s*[\w\d-]*\b', part, re.IGNORECASE):
            continue

        # Skip parts that end with street types (these are street names, not cities)
        if re.search(r'\b(st|street|ave|avenue|rd|road|dr|drive|blvd|boulevard|way|lane|ln|pkwy|parkway|ct|court|pl|place|plaza|plz|cir|circle|ter|terrace|commons|cmns)\s*(?:#?\d+)?$', part, re.IGNORECASE):
            continue

        filtered_parts.append(part)

    # Try to identify the city
    if filtered_parts:
        # The last filtered part is most likely the city
        city_candidate = filtered_parts[-1]

        # Clean up the city name
        # Remove any remaining suite/unit info
        city_candidate = re.sub(r'\b(suite|ste|unit|apt|#)\s*[\w\d-]+$', '', city_candidate, flags=re.IGNORECASE)
        # Remove any trailing business/building identifiers
        city_candidate = re.sub(r'\s*(building|bldg|center|ctr|office|tower|plaza|mall|complex)$', '', city_candidate, flags=re.IGNORECASE)
        city_candidate = city_candidate.strip()

        # Common city names in the data that we should recognize
        known_cities = ['Ashburn', 'Jacksonville', 'Santa Monica', 'Los Angeles', 'Whittier',
                        'Beverly Hills', 'Pasadena', 'Encino', 'Burbank', 'Glendale', 'Montebello',
                        'Hollywood', 'Lawndale', 'Sherman Oaks', 'Sterling', 'Leesburg']

        # Check if any known city is in the candidate
        for known_city in known_cities:
            if known_city.lower() in city_candidate.lower():
                components['city'] = known_city
                break

        # If no known city found, use the candidate if it looks reasonable
        if not components['city'] and city_candidate and not re.match(r'^\d', city_candidate):
            components['city'] = city_candidate

    # Special case: if we have FL state but no city, and "Jacksonville" might be in original address
    if components['state'] == 'FL' and not components['city']:
        if 'jacksonville' in original_address.lower():
            components['city'] = 'Jacksonville'

    # Special case: if we have VA state and Ashburn-related text
    if components['state'] == 'VA' and not components['city']:
        if 'ashburn' in original_address.lower():
            components['city'] = 'Ashburn'

    return components

def parse_location_type(location_str):
    """Determine if the location is an address, Place ID, or coordinates"""
    location_str = location_str.strip()

    # Check if it's a Place ID (starts with ChIJ)
    if location_str.startswith('ChIJ'):
        return 'place_id', location_str

    # Check if it's coordinates (format: lat,lng) - now more forgiving
    # Remove spaces around comma for better matching
    normalized = location_str.replace(' ', '')
    coord_pattern = r'^-?\d+\.?\d*,-?\d+\.?\d*$'
    if re.match(coord_pattern, normalized):
        parts = normalized.split(',')
        try:
            lat = float(parts[0])
            lng = float(parts[1])
            return 'coordinates', (lat, lng)
        except:
            pass

    # Otherwise, treat as address
    return 'address', location_str

# ======================================================================
# ACTOR SCRAPING LOGIC
# ======================================================================
async def geocode_with_address(page, address: str) -> dict:
    Actor.log.info(f"Geocoding address: {address}...")
    query = quote_plus(address)
    maps_url = f"https://www.google.com/maps/search/{query}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match:
        return {'found': False, 'error': 'Could not find coordinates from address'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, **ids}

async def geocode_with_place_id(page, place_id: str) -> dict:
    Actor.log.info(f"Geocoding Place ID: {place_id}...")
    maps_url = f"https://www.google.com/maps/search/?api=1&query=some_text&query_place_id={place_id}"
    await page.goto(maps_url, wait_until='load', timeout=25000)
    await asyncio.sleep(5)
    coord_match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', page.url)
    if not coord_match: return {'found': False, 'error': 'Could not find coordinates from Place ID'}
    lat, lng = float(coord_match.group(1)), float(coord_match.group(2))
    html_content = await page.content()
    ids = extract_place_ids(html_content)
    return {'found': True, 'latitude': lat, 'longitude': lng, 'place_id': place_id, **ids}

async def scrape_competitors_at_coordinates(page, source_company_name: str, lat: float, lon: float,
                                            search_query: str, max_competitors: int):
    Actor.log.info(f"Scraping competitors at {lat},{lon} for '{search_query}'...")
    query = quote_plus(search_query)
    zoom_level = 11  # Wider area view to show more businesses
    url = f"https://www.google.com/maps/search/{query}/@{lat},{lon},{zoom_level}z"
    Actor.log.info(f"Using zoom level {zoom_level} for wider area coverage")
    await page.goto(url, wait_until='domcontentloaded', timeout=45000)

    # Always use 25% zoom for maximum results
    await page.evaluate('document.body.style.zoom="0.25"')
    await asyncio.sleep(3)

    selectors = ['[role="article"]', '.Nv2PK.THOPZb.CpccDe', '.lI9IFe', '.hfpxzc']
    elems = []
    for sel in selectors:
        elems = await page.query_selector_all(sel)
        if elems and len(elems) > 5:
            Actor.log.info(f"Found {len(elems)} elements using selector: {sel}")
            break

    competitors = []
    cleaned_source_name = re.sub(r'[^a-z0-9]', '', source_company_name.lower()) if source_company_name else ""

    for element in elems[:max_competitors]:
        element_html = await element.inner_html()
        ids = extract_place_ids(element_html)
        full_text = await element.inner_text()
        lines = [ln.strip() for ln in full_text.split('\n') if ln.strip()]

        # Look for the actual business name, skipping numbers and UI elements
        business_name = None
        for i, line in enumerate(lines):
            # Skip single digits (likely list numbers)
            if line.strip().isdigit() and len(line.strip()) <= 2:
                continue
            # Skip common UI elements
            if line.lower() in ['ad', 'ads', 'sponsored', 'open', 'closed', 'open now']:
                continue
            # Try to validate and clean the line as a business name
            if is_valid_business_name(line):
                business_name = clean_business_name(line)
                if business_name:  # Make sure cleaning didn't result in empty string
                    break

        if not business_name or any(c['business_name'] == business_name for c in competitors):
            continue

        is_source = False
        if cleaned_source_name:
            cleaned_competitor_name = re.sub(r'[^a-z0-9]', '', business_name.lower())
            is_source = cleaned_source_name in cleaned_competitor_name or cleaned_competitor_name in cleaned_source_name

        # Extract address and parse components
        address = extract_address(lines)
        address_components = parse_address_components(address)

        competitor = {
            'place_id': ids.get('place_id'),
            'business_name': business_name,
            'google_rank': len(competitors) + 1,
            'is_source_business': is_source,
            'average_rating': extract_rating(lines),
            'number_of_reviews': extract_review_count(lines),
            'main_category': determine_category(lines),
            'address': address,
            'city': address_components['city'],
            'state': address_components['state'],
            'zipcode': address_components['zipcode']
        }
        competitors.append(competitor)

    return competitors

# ======================================================================
# MAIN ACTOR FUNCTION
# ======================================================================
async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Parse input from new schema
        locations_data = actor_input.get('locations', [])
        search_query = actor_input.get('searchQuery', 'medical spa')
        max_competitors = actor_input.get('maxCompetitors', 30)
        proxy_configuration = actor_input.get('proxyConfiguration')

        # Log input for debugging
        Actor.log.info(f"Received input: {len(locations_data)} locations")
        Actor.log.info(f"Search query: {search_query}")
        Actor.log.info(f"Max competitors per location: {max_competitors}")

        # Convert key-value pairs to locations and business names
        locations = []
        business_names = []
        for item in locations_data:
            if isinstance(item, dict):
                locations.append(item.get('key', ''))
                business_names.append(item.get('value', ''))
            else:
                # Fallback for string input
                locations.append(str(item))
                business_names.append('')

        if not locations:
            Actor.log.error("No locations provided. Please add at least one location.")
            await Actor.push_data({
                'error': 'No locations provided',
                'message': 'Please provide at least one location (address, Place ID, or coordinates)'
            })
            return

        Actor.log.info(f"Processing {len(locations)} locations...")
        Actor.log.info(f"Search query: '{search_query}'")
        Actor.log.info(f"Max competitors: {max_competitors}")

        # Track total results pushed
        total_results = 0

        # Launch browser
        async with async_playwright() as p:
            # Handle proxy if configured
            browser_options = {"headless": True}

            if proxy_configuration and proxy_configuration.get('useApifyProxy'):
                try:
                    # For Apify proxy, you might need to check the correct method
                    # This is a placeholder - check Apify docs for your SDK version
                    Actor.log.info("Proxy requested but implementation needs to be verified for your Apify SDK version")
                except Exception as e:
                    Actor.log.warning(f"Could not set up proxy: {e}")

            Actor.log.info("Launching browser...")
            browser = await p.chromium.launch(**browser_options)
            page = await browser.new_page()

            # Process each location
            for idx, location_str in enumerate(locations):
                # Get business name from the paired value
                business_name = business_names[idx] if idx < len(business_names) else None

                # Skip empty locations
                if not location_str or not location_str.strip():
                    continue

                # Parse location type
                location_type, location_data = parse_location_type(location_str)

                Actor.log.info(f"\nProcessing location {idx + 1}/{len(locations)}: {location_str}")
                if business_name:
                    Actor.log.info(f"Business name: {business_name}")

                geo = {'found': False}

                try:
                    if location_type == 'coordinates':
                        lat, lng = location_data
                        Actor.log.info(f"Using provided coordinates: {lat}, {lng}")
                        geo = {'found': True, 'latitude': lat, 'longitude': lng}
                    elif location_type == 'place_id':
                        geo = await geocode_with_place_id(page, location_data)
                    else:  # address
                        geo = await geocode_with_address(page, location_data)
                except Exception as e:
                    Actor.log.exception(f"Geocoding failed: {e}")
                    geo = {'found': False, 'error': str(e)}

                # Scrape competitors if geocoding succeeded
                competitors_list = []
                if geo.get('found'):
                    try:
                        competitors_list = await scrape_competitors_at_coordinates(
                            page,
                            business_name,
                            geo['latitude'],
                            geo['longitude'],
                            search_query,
                            max_competitors
                        )
                        Actor.log.info(f"Found {len(competitors_list)} competitors for {business_name or location_str}")
                    except Exception as e:
                        Actor.log.exception(f"Competitor scraping failed: {e}")
                else:
                    Actor.log.warning(f"Skipping competitor search - geocoding failed: {geo.get('error', 'Unknown error')}")

                # Push results - FLATTENED for CSV
                if competitors_list:
                    # One row per competitor
                    for competitor in competitors_list:
                        result = {
                            # Competitor details in requested order
                            'competitor_name': competitor['business_name'],
                            'competitor_address': competitor['address'] or '',
                            'competitor_city': competitor['city'] or '',
                            'competitor_state': competitor['state'] or '',
                            'competitor_zipcode': competitor['zipcode'] or '',
                            'competitor_category': competitor['main_category'],
                            'competitor_rank': competitor['google_rank'],
                            'competitor_rating': competitor['average_rating'],
                            'competitor_reviews': competitor['number_of_reviews'],
                            'competitor_place_id': competitor['place_id'] or '',
                            'latitude': geo.get('latitude'),
                            'longitude': geo.get('longitude'),
                            'search_query': search_query,
                            # Source business info at the end
                            'is_source_business': competitor['is_source_business'],
                            'input_location': location_str,
                            'input_business_name': business_name or ''
                        }
                        await Actor.push_data(result)
                        total_results += 1
                else:
                    # No competitors found - still push location info
                    result = {
                        # Empty competitor details in requested order
                        'competitor_name': '',
                        'competitor_address': '',
                        'competitor_city': '',
                        'competitor_state': '',
                        'competitor_zipcode': '',
                        'competitor_category': '',
                        'competitor_rank': None,
                        'competitor_rating': None,
                        'competitor_reviews': None,
                        'competitor_place_id': '',
                        'latitude': geo.get('latitude'),
                        'longitude': geo.get('longitude'),
                        'search_query': search_query,
                        # Source business info at the end
                        'is_source_business': None,
                        'input_location': location_str,
                        'input_business_name': business_name or ''
                    }
                    await Actor.push_data(result)
                    total_results += 1

            await browser.close()
            Actor.log.info(f"\nScraping completed! Processed {len(locations)} locations and found {total_results} total results.")

# Entry point
if __name__ == "__main__":
    asyncio.run(main())

src/py.typed

src/__init__.py


src/__main__.py

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
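
For local testing, the Dockerfile's `CMD` maps to `python -m src` run from the repo root. A hedged sketch of preparing input first, assuming the Apify Python SDK's default local storage layout under `storage/` (which the .gitignore already excludes); the path and values below are illustrative, not taken from this repo:

```python
# Hypothetical helper: write a local INPUT.json, then run `python -m src`.
import json
from pathlib import Path

# Assumed default location the Apify SDK reads input from when running locally.
input_path = Path("storage/key_value_stores/default/INPUT.json")
input_path.parent.mkdir(parents=True, exist_ok=True)

input_path.write_text(json.dumps({
    "searchQuery": "coffee shop",
    "locations": [{"key": "40.7128,-74.0060", "value": "My Coffee Shop"}],
    "maxCompetitors": 10,
}, indent=2))
```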