google-search
0.0 (0)
Pricing
Pay per usage
Total users: 401
Monthly users: 18
Runs succeeded: >99%
Last modified: 2 years ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, the build will be really fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "getting-started-python-parsel", "title": "Getting started with Python and parsel", "description": "aiohttp/parsel parse google search.", "version": "0.2", "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "search result", "views": { "overview": { "title": "overview", "transformation": { "fields": [ "now", "search_result_count", "items", "top_stories", "meta", "query" ] }, "display": { "component": "table", "properties": { "url": { "label": "now", "format": "number" }, "search_result_count": { "label": "search_result_count", "format": "number" }, "items": { "label": "items", "format": "array" }, "top_stories": { "label": "top_stories", "format": "array" }, "meta": { "label": "meta", "format": "object" }, "query": { "label": "query", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Python BeautifulSoup Scraper", "type": "object", "schemaVersion": 1, "properties": { "query": { "title": "query", "type": "string", "description": "query", "prefill": "trump", "editor": "textfield" } }, "required": ["query"]}
src/__init__.py
src/__main__.py
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Route apify and apify_client log records through the Actor log formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
src/main.py
import re
import time
import datetime
import asyncio
import json
import logging

import aiohttp
import parsel

from apify import Actor


logger = logging.getLogger('google_search')


def now():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def now_ts():
    return int(datetime.datetime.utcnow().timestamp())


def now_minute():
    return datetime.datetime.utcnow().minute


def pjson(m):
    print(json.dumps(m, indent=4, ensure_ascii=False))


class HttpClient:
    """Thin async context manager around an aiohttp.ClientSession."""

    def __init__(self, **kwargs):
        self._client = None
        self._kwargs = kwargs

    def _after_new_client(self):
        pass

    async def __aenter__(self):
        self._client = await aiohttp.ClientSession(**self._kwargs).__aenter__()
        self._after_new_client()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self._client.__aexit__(exc_type, exc, tb)


class GoogleSearch(HttpClient):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._proxy = None
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
        self._now = now()

    async def _get_proxy(self):
        if self._proxy:
            return self._proxy
        return None

    def _format_search_url(self, query, start=0, num=10, hl='en'):
        return {
            'q': query,
            'start': start,
            'num': num,
            'hl': hl,
        }

    async def search(self, query, meta=None, info=None, times=3, **kwargs):
        if meta is None:
            meta = {}
        proxy = await self._get_proxy()
        search_url = 'https://www.google.com/search'
        params = self._format_search_url(query, **kwargs)
        docid = meta.get('docid', '')
        try:
            logger.info(f'start download {docid} "{query}" proxy={proxy}')
            resp = await self._client.get(search_url, params=params, proxy=proxy,
                                          headers=self._headers, timeout=10)
            logger.info(f'finish download {docid} "{query}" proxy={proxy} status={resp.status}')
            if resp.status == 429:
                # Rate limited: retry with exponential backoff
                # (4 s, then 8 s with the default times=3) before giving up.
                times -= 1
                if times > 0:
                    s = 2 ** (4 - times)
                    logger.info(f'429 retry sleep {s}')
                    await asyncio.sleep(s)
                    if info:
                        info['retry'] += 1
                    return await self.search(query=query, meta=meta, info=info, times=times, **kwargs)
            meta['status_code'] = resp.status
            html = await resp.text()
        except Exception as e:
            meta['error'] = str(e)
        if 'error' in meta or resp.status != 200:
            return {
                'meta': meta,
            }
        result = await self.parse(html)
        result['meta'] = meta
        result['query'] = query
        return result

    async def parse(self, html):
        ts = now_ts()
        selector = parsel.Selector(html)
        result = {
            'now': ts,
        }
        # "About 1,230,000 results" line above the organic results.
        about_results = selector.xpath('//div[@id="result-stats"]/text()').get()
        count = self._parse_about_results(about_results)
        if count:
            result['search_result_count'] = count

        # "Top stories" carousel.
        for section in selector.xpath('//g-section-with-header'):
            title = section.xpath('..//..//*[@role="heading"]/text()').get()
            if isinstance(title, str):
                title = title.strip()
            if not title:
                continue
            if title == "Top stories":
                docs = []
                for a in section.xpath('.//a'):
                    heading_elem = a.xpath('.//div[@role="heading"]')
                    heading = heading_elem.xpath('./text()').get()
                    if not heading:
                        continue
                    heading = self._normalize_title(heading)
                    if not heading:
                        continue
                    href = a.xpath('./@href').get()
                    if not href:
                        continue
                    doc = {
                        'url': href,
                        'title': heading,
                    }
                    publish_time = heading_elem.xpath('./following-sibling::div[1]/span/text()').get()
                    if publish_time:
                        doc['publish_time_ago'] = publish_time
                    docs.append(doc)
                if docs:
                    result['top_stories'] = docs

        # Organic results.
        items = []
        for section in selector.xpath('//div[@data-sokoban-container]'):
            a = section.xpath('.//a')
            url = a.xpath('./@href').get()
            item = {}
            if not url:
                continue
            title = a.xpath('./h3/text()').get()
            if not title:
                continue
            item['url'] = url
            item['title'] = title
            description_block = section.xpath('.//div[@data-content-feature="1"]')
            publish_time_block = description_block.xpath('.//span[@class="MUxGbd wuQ4Ob WZ8Tjf"]')
            publish_time_ago = publish_time_block.xpath('./span/text()').get()
            if publish_time_ago:
                item['publish_time_ago'] = publish_time_ago
            description = ''.join(description_block.xpath('./div/span[last()]//text()').getall()).strip()
            item['description'] = description
            items.append(item)
        result['items'] = items
        return result

    def _normalize_title(self, title):
        return re.sub(r'\s+', ' ', title).strip()

    def _parse_about_results(self, msg):
        if not msg:
            return None
        result = re.match(r'About ([0-9,]+) results', msg)
        if not result:
            return None
        return int(result.group(1).replace(',', ''))


async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        query = actor_input.get('query', 'trump')

        async with GoogleSearch() as gs:
            result = await gs.search(query)

        await Actor.push_data(result)
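Outside the Apify runtime, the GoogleSearch client can be exercised on its own. A minimal sketch, assuming the package is importable as src (Google may well answer with a 429 or a CAPTCHA page, in which case only meta comes back):

import asyncio

from src.main import GoogleSearch

async def demo():
    async with GoogleSearch() as gs:
        # start/num/hl are forwarded to _format_search_url as query parameters.
        result = await gs.search('web scraping', num=10, hl='en')
    print(result.get('search_result_count'))
    for item in result.get('items', []):
        print(item['title'], '->', item['url'])

asyncio.run(demo())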
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.0.0
beautifulsoup4 ~= 4.9.0
requests ~= 2.28.2
aiohttp
parsel