1"""
2Google Search Results API - Real-time SERP Scraping
3Scrapes Google search results with TLS fingerprint bypass and proxy support.
4"""
5from __future__ import annotations
6
7import asyncio
8import json
9import os
10import random
11import re
12from datetime import datetime, timezone
13from typing import Any, List
14from urllib.parse import quote_plus, unquote
15
16try:
17 from curl_cffi import requests
18 HAS_CURL = True
19except ImportError:
20 import requests
21 HAS_CURL = False
22
23from apify import Actor
24
25
# Desktop browser User-Agent strings; scrape_google picks one at random per page.
USER_AGENTS = [
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    ),
    (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 '
        '(KHTML, like Gecko) Version/18.1 Safari/605.1.15'
    ),
    (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    ),
]
31
32
def sanitize_text(text: str, max_len: int = 500) -> str:
    """Return *text* with whitespace normalised and length capped.

    Leading/trailing whitespace is stripped, every remaining run of
    whitespace becomes a single space, and the result is truncated to at
    most *max_len* characters. Falsy input (``''`` or ``None``) yields ``''``.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text.strip())
        return collapsed[:max_len]
    return ''
36
37
def extract_url(href: str) -> str:
    """Return the absolute target URL for a search-result ``href``.

    Google wraps outbound links in a redirect of the form
    ``/url?q=<target>&...`` or ``/url?...&url=<target>&...``. For such links the
    percent-decoded target is returned; ``q`` takes precedence over ``url`` and
    empty parameter values are ignored. Any other href is returned as-is when
    it starts with ``http``.

    Args:
        href: Raw ``href`` attribute value as captured from the HTML.

    Returns:
        The resolved URL, or ``''`` when *href* is empty or does not lead to
        an ``http``/``https`` URL.
    """
    if not href:
        return ''

    # Attribute values captured from raw HTML separate parameters with the
    # entity "&amp;"; normalise so the query can be split on "&".
    link = href.replace('&amp;', '&')

    # Only unwrap real redirect links (relative "/url?..." or the same path on a
    # Google host). Searching for "url=" anywhere would also fire on ordinary
    # links that merely carry a parameter such as "returnurl=".
    redirect = re.match(
        r'(?:https?://(?:[\w-]+\.)*google\.[\w.]+)?/url\?([^#]*)', link
    )
    if redirect:
        params = {}
        for pair in redirect.group(1).split('&'):
            key, _, value = pair.partition('=')
            if value:
                params.setdefault(key, value)  # first non-empty occurrence wins
        target = unquote(params.get('q') or params.get('url') or '')
        if target.startswith('http'):
            return target

    return link if link.startswith('http') else ''
45
46
def _fragment_text(fragment: str) -> str:
    """Turn an HTML fragment into plain text: drop tags, decode entities, tidy whitespace."""
    # Local import under an alias: the module's import block does not provide
    # it, and the public parser below has a parameter that is itself named "html".
    from html import unescape as decode_entities

    # Tags are removed before decoding so that an encoded "&lt;b&gt;" survives
    # as literal text instead of being stripped as markup.
    return sanitize_text(decode_entities(re.sub(r'<[^>]+>', '', fragment)))


def parse_results_soup(html: str) -> List[dict[str, Any]]:
    """Parse Google search results using regex for speed and resilience.

    Candidate result containers are located with a regular expression (at most
    the first 50 are examined). From each container the first ``<h3>`` supplies
    the title, the first ``<a href>`` supplies the link (resolved through
    ``extract_url``) and a snippet is taken from a ``VwiC3b``/``lEBKkf`` div or,
    failing that, an ``aCOpRe``/``st`` span. If no container yields a result, a
    fallback scans the page for ``<a href="http..."><h3>...</h3></a>`` pairs.

    Titles and snippets are returned as plain text with HTML entities decoded.

    Args:
        html: Raw HTML of a results page.

    Returns:
        Up to 30 dicts with keys ``title``, ``url`` and ``snippet``, in page
        order and de-duplicated by URL.
    """
    results = []
    seen_urls = set()

    # NOTE(review): the class pattern matches any class attribute that merely
    # contains the letter "g", not only a standalone "g" token - confirm
    # against current markup before tightening.
    blocks = re.findall(
        r'<div[^>]*class="[^"]*g[^"]*"[^>]*>.*?</div>\s*</div>\s*</div>',
        html, re.DOTALL
    )[:50]

    for block in blocks:
        h3 = re.search(r'<h3[^>]*>(.*?)</h3>', block, re.DOTALL)
        link = re.search(r'<a[^>]+href="([^"]+)"[^>]*>', block, re.DOTALL)
        if not (h3 and link):
            continue
        title = _fragment_text(h3.group(1))
        url = extract_url(link.group(1))
        if not url or not title or url in seen_urls:
            continue
        seen_urls.add(url)

        snippet = ''
        snip = re.search(
            r'<div[^>]*class="[^"]*(?:VwiC3b|lEBKkf)[^"]*"[^>]*>(.*?)</div>',
            block, re.DOTALL
        )
        if snip:
            snippet = _fragment_text(snip.group(1))
        if not snippet:
            # Alternative snippet markup, tried only when the div form gave nothing.
            snip2 = re.search(
                r'<span[^>]*class="[^"]*(?:aCOpRe|st)[^"]*"[^>]*>(.*?)</span>',
                block, re.DOTALL
            )
            if snip2:
                snippet = _fragment_text(snip2.group(1))

        results.append({'title': title, 'url': url, 'snippet': snippet})

    if not results:
        # Fallback: bare anchor-wrapped headings anywhere in the page.
        for match in re.finditer(
            r'<a[^>]+href="(https?://[^"]+)"[^>]*><h3[^>]*>(.*?)</h3></a>',
            html
        ):
            url = match.group(1)
            title = _fragment_text(match.group(2))
            if url and title and url not in seen_urls:
                seen_urls.add(url)
                results.append({'title': title, 'url': url, 'snippet': ''})

    return results[:30]
90
91
def _fetch_page(search_url: str, headers: dict[str, str]) -> Any:
    """Blocking GET of one results page; impersonates Chrome 131 when curl_cffi is available."""
    if HAS_CURL:
        return requests.get(search_url, headers=headers, impersonate='chrome131', timeout=20)
    return requests.get(search_url, headers=headers, timeout=20)


async def scrape_google(query: str, num_results: int = 20) -> List[dict[str, Any]]:
    """Collect up to *num_results* Google results for *query*.

    Pages of 10 results are requested one after another (at most 3 pages).
    Each page is attempted up to twice; HTTP 429, other non-200 statuses,
    request exceptions and pages that parse to nothing all count as a failed
    attempt. Scraping stops at the first page that cannot be fetched.

    Args:
        query: Search phrase; it is URL-encoded before being sent.
        num_results: Maximum number of results to return.

    Returns:
        Result dicts as produced by ``parse_results_soup``, de-duplicated by
        URL across pages and truncated to *num_results*.
    """
    results = []
    seen_urls = set()  # kept alongside `results` so de-duplication is O(1) per item
    max_pages = 3
    max_attempts = 2
    loop = asyncio.get_running_loop()

    for page_index in range(max_pages):
        if len(results) >= num_results:
            break

        start = page_index * 10
        search_url = f"https://www.google.com/search?q={quote_plus(query)}&num=10&start={start}&hl=en"
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/',
            'Cache-Control': 'no-cache',
        }

        page_ok = False
        for attempt in range(1, max_attempts + 1):
            try:
                # The HTTP client is synchronous; run it in the default executor
                # so the event loop is not blocked for the request's duration.
                resp = await loop.run_in_executor(None, _fetch_page, search_url, headers)
                if resp.status_code == 200:
                    page = parse_results_soup(resp.text)
                    if page:
                        for item in page:
                            if item['url'] not in seen_urls:
                                seen_urls.add(item['url'])
                                results.append(item)
                        page_ok = True
                        Actor.log.info(f"Page {page_index+1}: {len(page)} results")
                        break
                    Actor.log.warning(f"Page {page_index+1}: no results parsed")
                    delay = 3
                elif resp.status_code == 429:
                    Actor.log.warning("429 rate limited, retrying...")
                    delay = 5 * (attempt + 1)  # longer back-off on each rate-limited attempt
                else:
                    Actor.log.warning(f"HTTP {resp.status_code}")
                    delay = 2
            except Exception as e:
                # Best-effort scraping: any failure is logged and counts as a failed attempt.
                Actor.log.error(f"Request failed: {e}")
                delay = 3
            # Waiting is only useful when another attempt will follow.
            if attempt < max_attempts:
                await asyncio.sleep(delay)

        if not page_ok:
            break
        # Pause between pages, but not after the last one that will be fetched.
        if len(results) < num_results and page_index + 1 < max_pages:
            await asyncio.sleep(2)

    return results[:num_results]
145
146
def _parse_input(inp: Any) -> tuple[str, int]:
    """Extract ``(query, num_results)`` from raw actor input, tolerating odd shapes."""
    outer = inp if isinstance(inp, dict) else {}
    payload = outer.get('input', inp) if isinstance(inp, dict) else inp

    if isinstance(payload, str):
        # A bare string is the query itself; a limit can then only come from
        # the surrounding dict, if there is one.
        query, options = payload, outer
    elif isinstance(payload, dict):
        query, options = payload.get('query', outer.get('query', '')) or '', payload
    else:
        query, options = '', {}

    # The query is later URL-encoded, which requires a string.
    query = str(query).strip()

    raw_num = options.get('num_results', options.get('maxResults', options.get('limit', 20)))
    try:
        num = int(raw_num)
    except (TypeError, ValueError):
        Actor.log.warning(f'Invalid result limit {raw_num!r}; using 20')
        num = 20
    return query, num


async def main() -> None:
    """Actor entry point: read input, scrape Google, push results to the dataset.

    The input may be a dict holding ``query`` (optionally nested under
    ``input``) or a bare query string. The result limit is read from
    ``num_results``, ``maxResults`` or ``limit`` (in that order of precedence)
    and defaults to 20. When no query is supplied, a single error record is
    pushed instead of results.
    """
    async with Actor:
        inp = await Actor.get_input() or {}
        Actor.log.info(f"Input: {json.dumps(inp)}")
        query, num = _parse_input(inp)

        if not query:
            Actor.log.error('No query provided')
            await Actor.push_data({
                'error': True, 'message': 'query field is required',
                'received_input': str(inp)[:200], 'source': 'google',
                'scrapedAt': datetime.now(timezone.utc).isoformat()
            })
            return

        Actor.log.info(f'Query: "{query}" max={num}')
        results = await scrape_google(query, num)

        scraped_at = datetime.now(timezone.utc).isoformat()
        items = [
            {
                'position': position, 'title': r['title'], 'url': r['url'],
                'snippet': r['snippet'], 'query': query, 'source': 'google',
                'scrapedAt': scraped_at,
            }
            for position, r in enumerate(results, 1)
        ]
        # Push the whole batch in one call rather than one awaited call per item.
        if items:
            await Actor.push_data(items)

        Actor.log.info(f'Returned {len(results)} results')
175
176
# Script entry point: run the actor coroutine only when executed directly.
if __name__ == '__main__':
    asyncio.run(main())