import asyncio
import datetime
import json
import logging
import re
import time

import aiohttp
import parsel
from apify import Actor

logger = logging.getLogger('google_search')


def now():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def now_ts():
    # Use an aware UTC datetime: naive utcnow().timestamp() is interpreted
    # in local time and utcnow() is deprecated in Python 3.12+.
    return int(datetime.datetime.now(datetime.timezone.utc).timestamp())


def now_minute():
    return datetime.datetime.now(datetime.timezone.utc).minute


def pjson(m):
    print(json.dumps(m, indent=4, ensure_ascii=False))


class HttpClient:
    """Thin async context-manager wrapper around aiohttp.ClientSession."""

    def __init__(self, **kwargs):
        self._client = None
        self._kwargs = kwargs

    def _after_new_client(self):
        # Hook for subclasses to run setup once the session exists.
        pass

    async def __aenter__(self):
        self._client = await aiohttp.ClientSession(**self._kwargs).__aenter__()
        self._after_new_client()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self._client.__aexit__(exc_type, exc, tb)

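# Usage sketch (hypothetical, not part of the Actor flow): HttpClient
# forwards its keyword arguments straight to aiohttp.ClientSession, e.g.
#
#     async with HttpClient(headers={'Accept': 'text/html'}) as http:
#         ...  # http._client is the live aiohttp.ClientSession

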
class GoogleSearch(HttpClient):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._proxy = None
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
        self._now = now()

    async def _get_proxy(self):
        if self._proxy:
            return self._proxy
        return None

    def _format_search_params(self, query, start=0, num=10, hl='en'):
        # Query-string parameters for https://www.google.com/search:
        # q=<query>, start=<result offset>, num=<page size>, hl=<UI language>.
        return {
            'q': query,
            'start': start,
            'num': num,
            'hl': hl,
        }

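    # For example (sketch): search('python', start=10) requests
    # https://www.google.com/search?q=python&start=10&num=10&hl=en
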
    async def search(self, query, meta=None, info=None, times=3, **kwargs):
        if meta is None:
            meta = {}
        proxy = await self._get_proxy()
        search_url = 'https://www.google.com/search'
        params = self._format_search_params(query, **kwargs)
        docid = meta.get('docid', '')
        try:
            logger.info(f'start download {docid} "{query}" proxy={proxy}')
            resp = await self._client.get(search_url, params=params, proxy=proxy, headers=self._headers, timeout=10)
            logger.info(f'finish download {docid} "{query}" proxy={proxy} status={resp.status}')
            if resp.status == 429:
                times -= 1
                if times > 0:
                    # Exponential backoff: 2s, 4s, 8s as retries are used up.
                    s = 2 ** (4 - times)
                    logger.info(f'429 retry sleep {s}')
                    await asyncio.sleep(s)
                    if info:
                        info['retry'] += 1
                    return await self.search(query=query, meta=meta, info=info, times=times, **kwargs)
            meta['status_code'] = resp.status
            html = await resp.text()
        except Exception as e:
            # Store a string so the result stays JSON-serializable.
            meta['error'] = str(e)
            return {
                'meta': meta,
            }
        if resp.status != 200:
            return {
                'meta': meta,
            }
        result = await self.parse(html)
        result['meta'] = meta
        result['query'] = query
        return result

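    # Caller sketch (hypothetical names): meta tags the request and info
    # accumulates retry counts across the recursive 429 retries, e.g.
    #
    #     info = {'retry': 0}
    #     result = await gs.search('python', meta={'docid': 'doc-1'}, info=info)
    #     # info['retry'] then holds the number of 429 retries performed.
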
    async def parse(self, html):
        ts = now_ts()  # local name kept distinct from the module-level now()
        selector = parsel.Selector(html)
        result = {
            'now': ts,
        }
        about_results = selector.xpath('//div[@id="result-stats"]/text()').get()
        count = self._parse_about_results(about_results)
        if count:
            result['search_result_count'] = count
        for section in selector.xpath('//g-section-with-header'):
            title = section.xpath('..//..//*[@role="heading"]/text()').get()
            if isinstance(title, str):
                title = title.strip()
            if not title:
                continue
            if title == "Top stories":
                docs = []
                for a in section.xpath('.//a'):
                    heading_elem = a.xpath('.//div[@role="heading"]')
                    heading = heading_elem.xpath('./text()').get()
                    if not heading:
                        continue
                    heading = self._normalize_title(heading)
                    if not heading:
                        continue
                    href = a.xpath('./@href').get()
                    if not href:
                        continue
                    doc = {
                        'url': href,
                        'title': heading,
                    }
                    publish_time = heading_elem.xpath('./following-sibling::div[1]/span/text()').get()
                    if publish_time:
                        doc['publish_time_ago'] = publish_time
                    docs.append(doc)
                if docs:
                    result['top_stories'] = docs

        items = []
        # Organic result blocks: url, title, description, optional age.
        for section in selector.xpath('//div[@data-sokoban-container]'):
            a = section.xpath('.//a')
            url = a.xpath('./@href').get()
            item = {}
            if not url:
                continue
            title = a.xpath('./h3/text()').get()
            if not title:
                continue
            item['url'] = url
            item['title'] = title
            description_block = section.xpath('.//div[@data-content-feature="1"]')
            publish_time_block = description_block.xpath('.//span[@class="MUxGbd wuQ4Ob WZ8Tjf"]')
            publish_time_ago = publish_time_block.xpath('./span/text()').get()
            if publish_time_ago:
                item['publish_time_ago'] = publish_time_ago
            description = ''.join(description_block.xpath('./div/span[last()]//text()').getall()).strip()
            item['description'] = description

            items.append(item)
        result['items'] = items
        return result

    def _normalize_title(self, title):
        return re.sub(r'\s+', ' ', title).strip()

    def _parse_about_results(self, msg):
        # Matches the English result-stats line, e.g. 'About 1,230,000 results';
        # other hl values use different wording and fall through to None.
        if not msg:
            return None
        result = re.match(r'About ([0-9,]+) results', msg)
        if not result:
            return None
        return int(result.group(1).replace(',', ''))

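# Parsing sketch: _parse_about_results('About 1,230,000 results') returns
# 1230000; a missing or non-English result-stats string returns None.

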
async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        query = actor_input.get('query', 'trump')

        async with GoogleSearch() as gs:
            result = await gs.search(query)

        await Actor.push_data(result)
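

# Entry-point sketch: Apify's Python templates usually call main() from
# src/__main__.py; this guard is an assumption for running the file directly.
if __name__ == '__main__':
    asyncio.run(main())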