google-search
parse/google-search
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "getting-started-python-parsel",
    "title": "Getting started with Python and parsel",
    "description": "Parses Google search results with aiohttp and parsel.",
    "version": "0.2",
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Search results",
            "views": {
                "overview": {
                    "title": "Overview",
                    "transformation": {
                        "fields": [
                            "now",
                            "search_result_count",
                            "items",
                            "top_stories",
                            "meta",
                            "query"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "now": {
                                "label": "now",
                                "format": "number"
                            },
                            "search_result_count": {
                                "label": "search_result_count",
                                "format": "number"
                            },
                            "items": {
                                "label": "items",
                                "format": "array"
                            },
                            "top_stories": {
                                "label": "top_stories",
                                "format": "array"
                            },
                            "meta": {
                                "label": "meta",
                                "format": "object"
                            },
                            "query": {
                                "label": "query",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}
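For reference, a record pushed by src/main.py and displayed through this overview view has roughly the following shape; this is a sketch with made-up values, and the optional fields appear only when they are actually parsed from the page:

# Illustrative dataset record, matching the fields declared above
# (values are invented for the example):
record = {
    'now': 1679000000,                      # UTC timestamp from now_ts()
    'search_result_count': 1230000000,      # parsed from "About ... results"
    'items': [
        {
            'url': 'https://example.com/article',
            'title': 'Example result title',
            'description': 'Snippet text from the result block.',
            'publish_time_ago': '2 days ago',   # only when detected
        },
    ],
    'top_stories': [                        # only when a "Top stories" section exists
        {'url': 'https://example.com/story', 'title': 'Example story'},
    ],
    'meta': {'status_code': 200},
    'query': 'trump',
}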
.actor/input_schema.json
{
    "title": "Google Search Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "query": {
            "title": "Query",
            "type": "string",
            "description": "Search query to run against Google.",
            "prefill": "trump",
            "editor": "textfield"
        }
    },
    "required": ["query"]
}
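With this schema, the only required input is query. A minimal sketch of calling the Actor from Python with the apify-client package; the token placeholder and the printed fields are assumptions for illustration:

from apify_client import ApifyClient

client = ApifyClient('<YOUR_APIFY_TOKEN>')  # placeholder token

# Run the Actor with an input matching the schema above.
run = client.actor('parse/google-search').call(run_input={'query': 'trump'})

# Read the pushed records from the run's default dataset.
for record in client.dataset(run['defaultDatasetId']).iterate_items():
    print(record['query'], record.get('search_result_count'))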
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Route Apify SDK and client log records through a formatter
# that matches the Apify platform's log style.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

# Run the Actor's asynchronous entry point.
asyncio.run(main())
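Note that src/main.py logs through its own 'google_search' logger, which gets no handler here, so its INFO-level download messages will not appear in the run log by default. A small optional sketch of wiring it up the same way (an addition for illustration, not part of the original template):

import logging
from apify.log import ActorLogFormatter

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

# Attach the same handler to the scraper's logger so its
# "start download" / "finish download" messages become visible.
gs_logger = logging.getLogger('google_search')
gs_logger.setLevel(logging.INFO)
gs_logger.addHandler(handler)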
src/main.py
import re
import time
import datetime
import json
import asyncio
import logging

import aiohttp
import parsel

from apify import Actor


logger = logging.getLogger('google_search')


def now():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def now_ts():
    return int(datetime.datetime.utcnow().timestamp())


def now_minute():
    return datetime.datetime.utcnow().minute


def pjson(m):
    print(json.dumps(m, indent=4, ensure_ascii=False))


class HttpClient:
    """Thin async context manager around an aiohttp.ClientSession."""

    def __init__(self, **kwargs):
        self._client = None
        self._kwargs = kwargs

    def _after_new_client(self):
        pass

    async def __aenter__(self):
        self._client = await aiohttp.ClientSession(**self._kwargs).__aenter__()
        self._after_new_client()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self._client.__aexit__(exc_type, exc, tb)


class GoogleSearch(HttpClient):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._proxy = None
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
        self._now = now()

    async def _get_proxy(self):
        if self._proxy:
            return self._proxy
        return None

    def _format_search_params(self, query, start=0, num=10, hl='en'):
        return {
            'q': query,
            'start': start,
            'num': num,
            'hl': hl,
        }

    async def search(self, query, meta=None, info=None, times=3, **kwargs):
        if meta is None:
            meta = {}
        proxy = await self._get_proxy()
        search_url = 'https://www.google.com/search'
        params = self._format_search_params(query, **kwargs)
        docid = meta.get('docid', '')
        try:
            logger.info(f'start download {docid} "{query}" proxy={proxy}')
            resp = await self._client.get(search_url, params=params, proxy=proxy, headers=self._headers, timeout=10)
            logger.info(f'finish download {docid} "{query}" proxy={proxy} status={resp.status}')
            if resp.status == 429:
                # Rate limited: back off exponentially and retry up to `times` times.
                times -= 1
                if times > 0:
                    s = 2 ** (4 - times)
                    logger.info(f'429 retry sleep {s}')
                    await asyncio.sleep(s)
                    if info:
                        info['retry'] += 1
                    return await self.search(query=query, meta=meta, info=info, times=times, **kwargs)
            meta['status_code'] = resp.status
            html = await resp.text()
        except Exception as e:
            # Store the error as text so the record stays JSON-serializable.
            meta['error'] = str(e)
        if 'error' in meta or resp.status != 200:
            return {
                'meta': meta,
            }
        result = await self.parse(html)
        result['meta'] = meta
        result['query'] = query
        return result

    async def parse(self, html):
        selector = parsel.Selector(html)
        result = {
            'now': now_ts(),
        }
        about_results = selector.xpath('//div[@id="result-stats"]/text()').get()
        count = self._parse_about_results(about_results)
        if count:
            result['search_result_count'] = count
        for section in selector.xpath('//g-section-with-header'):
            title = section.xpath('..//..//*[@role="heading"]/text()').get()
            if isinstance(title, str):
                title = title.strip()
            if not title:
                continue
            if title == "Top stories":
                docs = []
                for a in section.xpath('.//a'):
                    heading_elem = a.xpath('.//div[@role="heading"]')
                    heading = heading_elem.xpath('./text()').get()
                    if not heading:
                        continue
                    heading = self._normalize_title(heading)
                    if not heading:
                        continue
                    href = a.xpath('./@href').get()
                    if not href:
                        continue
                    doc = {
                        'url': href,
                        'title': heading,
                    }
                    publish_time = heading_elem.xpath('./following-sibling::div[1]/span/text()').get()
                    if publish_time:
                        doc['publish_time_ago'] = publish_time
                    docs.append(doc)
                if docs:
                    result['top_stories'] = docs

        items = []
        for section in selector.xpath('//div[@data-sokoban-container]'):
            a = section.xpath('.//a')
            url = a.xpath('./@href').get()
            item = {}
            if not url:
                continue
            title = a.xpath('./h3/text()').get()
            if not title:
                continue
            item['url'] = url
            item['title'] = title
            description_block = section.xpath('.//div[@data-content-feature="1"]')
            publish_time_block = description_block.xpath('.//span[@class="MUxGbd wuQ4Ob WZ8Tjf"]')
            publish_time_ago = publish_time_block.xpath('./span/text()').get()
            if publish_time_ago:
                item['publish_time_ago'] = publish_time_ago
            description = ''.join(description_block.xpath('./div/span[last()]//text()').getall()).strip()
            item['description'] = description

            items.append(item)
        result['items'] = items
        return result

    def _normalize_title(self, title):
        # Collapse runs of whitespace into single spaces.
        return re.sub(r'\s+', ' ', title).strip()

    def _parse_about_results(self, msg):
        # Extract the integer from e.g. 'About 1,230,000 results'.
        if not msg:
            return None
        result = re.match(r'About ([0-9,]+) results', msg)
        if not result:
            return None
        return int(result.group(1).replace(',', ''))


async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        query = actor_input.get('query', 'trump')

        async with GoogleSearch() as gs:
            result = await gs.search(query)

        await Actor.push_data(result)
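Because GoogleSearch is self-contained, it can also be driven directly for local experimentation, outside the Apify platform. A minimal sketch, assuming the dependencies from requirements.txt are installed and the module is importable as src.main (the query value is just an example):

import asyncio

from src.main import GoogleSearch

async def demo():
    # Open the aiohttp session via the async context manager and run one search.
    async with GoogleSearch() as gs:
        result = await gs.search('python web scraping', num=5)
        print(result.get('search_result_count'))
        for item in result.get('items', []):
            print(item['title'], item['url'])

asyncio.run(demo())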
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.0.0
aiohttp
parsel