google-search

parse/google-search

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "getting-started-python-parsel",
    "title": "Getting started with Python and parsel",
    "description": "Scrapes Google Search results using aiohttp and parsel.",
    "version": "0.2",
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Search results",
            "views": {
                "overview": {
                    "title": "Overview",
                    "transformation": {
                        "fields": [
                            "now",
                            "search_result_count",
                            "items",
                            "top_stories",
                            "meta",
                            "query"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "now": {
                                "label": "now",
                                "format": "number"
                            },
                            "search_result_count": {
                                "label": "search_result_count",
                                "format": "number"
                            },
                            "items": {
                                "label": "items",
                                "format": "array"
                            },
                            "top_stories": {
                                "label": "top_stories",
                                "format": "array"
                            },
                            "meta": {
                                "label": "meta",
                                "format": "object"
                            },
                            "query": {
                                "label": "query",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}
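
The "overview" view above displays the fields produced by parse() in src/main.py, one table row per dataset record. A minimal sketch of the shape of one such record; the field names come from actor.json and src/main.py, while the values below are purely illustrative, not taken from a real run:

# Illustrative shape of a single dataset record shown by the "overview" view.
# Values are made up for the example; only the keys are taken from the source.
example_record = {
    "now": 1679000000,               # UTC timestamp set by parse()
    "search_result_count": 1230000,  # parsed from the "About N results" text
    "items": [
        {"url": "https://example.com", "title": "Example result", "description": "Example snippet"},
    ],
    "top_stories": [
        {"url": "https://example.com/news", "title": "Example story", "publish_time_ago": "2 hours ago"},
    ],
    "meta": {"status_code": 200},
    "query": "trump",
}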

.actor/input_schema.json

{
    "title": "Google Search Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "query": {
            "title": "Query",
            "type": "string",
            "description": "The search query to submit to Google.",
            "prefill": "trump",
            "editor": "textfield"
        }
    },
    "required": ["query"]
}
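
The schema declares a single required string field, "query". A minimal sketch (assumed, not from the source) of how that input arrives in src/main.py:

# What Actor.get_input() returns for this schema; "query" is the only required field.
# The value mirrors the prefill and is only an example.
actor_input = {"query": "trump"}
query = actor_input.get("query", "trump")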

src/__init__.py

src/__main__.py

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())

src/main.py

import asyncio
import datetime
import json
import logging
import re
import time

import aiohttp
import parsel

from apify import Actor


logger = logging.getLogger('google_search')


def now():
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def now_ts():
    return int(datetime.datetime.utcnow().timestamp())


def now_minute():
    return datetime.datetime.utcnow().minute


def pjson(m):
    print(json.dumps(m, indent=4, ensure_ascii=False))


class HttpClient:
    """Thin async context manager wrapping an aiohttp.ClientSession."""

    def __init__(self, **kwargs):
        self._client = None
        self._kwargs = kwargs

    def _after_new_client(self):
        pass

    async def __aenter__(self):
        self._client = await aiohttp.ClientSession(**self._kwargs).__aenter__()
        self._after_new_client()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self._client.__aexit__(exc_type, exc, tb)


class GoogleSearch(HttpClient):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._proxy = None
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
        self._now = now()

    async def _get_proxy(self):
        if self._proxy:
            return self._proxy
        return None

    def _format_search_url(self, query, start=0, num=10, hl='en'):
        return {
            'q': query,
            'start': start,
            'num': num,
            'hl': hl,
        }

    async def search(self, query, meta=None, info={}, times=3, **kwargs):
        if meta is None:
            meta = {}
        proxy = await self._get_proxy()
        search_url = 'https://www.google.com/search'
        params = self._format_search_url(query, **kwargs)
        docid = '' if not meta else meta.get('docid', '')
        try:
            logger.info(f'start download {docid} "{query}" proxy={proxy}')
            resp = await self._client.get(search_url, params=params, proxy=proxy, headers=self._headers, timeout=10)
            logger.info(f'finish download {docid} "{query}" proxy={proxy} status={resp.status}')
            if resp.status == 429:
                # Rate limited: back off exponentially and retry up to `times` attempts.
                times -= 1
                if times > 0:
                    s = 2 ** (4 - times)
                    logger.info(f'429 retry sleep {s}')
                    await asyncio.sleep(s)
                    if info:
                        info['retry'] += 1
                    return await self.search(query=query, meta=meta, info=info, times=times, **kwargs)
            meta['status_code'] = resp.status
            html = await resp.text()
        except Exception as e:
            meta['error'] = str(e)  # store as text so the record stays JSON-serializable
        if 'error' in meta or resp.status != 200:
            return {
                'meta': meta,
            }
        result = await self.parse(html)
        result['meta'] = meta
        result['query'] = query
        return result

    async def parse(self, html):
        ts = now_ts()
        selector = parsel.Selector(html)
        result = {
            'now': ts,
        }
        about_results = selector.xpath('//div[@id="result-stats"]/text()').get()
        count = self._parse_about_results(about_results)
        if count:
            result['search_result_count'] = count

        # Extract the "Top stories" carousel, if present.
        for section in selector.xpath('//g-section-with-header'):
            title = section.xpath('..//..//*[@role="heading"]/text()').get()
            if isinstance(title, str):
                title = title.strip()
            if not title:
                continue
            if title == "Top stories":
                docs = []
                for a in section.xpath('.//a'):
                    heading_elem = a.xpath('.//div[@role="heading"]')
                    heading = heading_elem.xpath('./text()').get()
                    if not heading:
                        continue
                    heading = self._normalize_title(heading)
                    if not heading:
                        continue
                    href = a.xpath('./@href').get()
                    if not href:
                        continue
                    doc = {
                        'url': href,
                        'title': heading,
                    }
                    publish_time = heading_elem.xpath('./following-sibling::div[1]/span/text()').get()
                    if publish_time:
                        doc['publish_time_ago'] = publish_time
                    docs.append(doc)
                if docs:
                    result['top_stories'] = docs

        # Extract the organic search results.
        items = []
        for section in selector.xpath('//div[@data-sokoban-container]'):
            a = section.xpath('.//a')
            url = a.xpath('./@href').get()
            item = {}
            if not url:
                continue
            title = a.xpath('./h3/text()').get()
            if not title:
                continue
            item['url'] = url
            item['title'] = title
            description_block = section.xpath('.//div[@data-content-feature="1"]')
            publish_time_block = description_block.xpath('.//span[@class="MUxGbd wuQ4Ob WZ8Tjf"]')
            publish_time_ago = publish_time_block.xpath('./span/text()').get()
            if publish_time_ago:
                item['publish_time_ago'] = publish_time_ago
            description = ''.join(description_block.xpath('./div/span[last()]//text()').getall()).strip()
            item['description'] = description

            items.append(item)
        result['items'] = items
        return result

    def _normalize_title(self, title):
        return re.sub(r'\s+', ' ', title).strip()

    def _parse_about_results(self, msg):
        if not msg:
            return None
        result = re.match(r'About ([0-9,]+) results', msg)
        if not result:
            return None
        return int(result.group(1).replace(',', ''))


async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        query = actor_input.get('query', 'trump')

        async with GoogleSearch() as gs:
            result = await gs.search(query)

        await Actor.push_data(result)
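
Outside the Apify platform, the GoogleSearch client above can be exercised on its own. A minimal sketch, assuming the package layout shown here and an unblocked request to Google; the file name run_local.py is hypothetical and not part of the Actor:

# run_local.py - hypothetical local smoke test, run from the project root.
import asyncio
import json

from src.main import GoogleSearch


async def demo():
    # start/num/hl are forwarded to _format_search_url via **kwargs.
    async with GoogleSearch() as gs:
        result = await gs.search('python asyncio', num=10, hl='en')
    print(json.dumps(result, indent=2, ensure_ascii=False, default=str))


if __name__ == '__main__':
    asyncio.run(demo())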

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log

requirements.txt

# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.0.0
beautifulsoup4 ~= 4.9.0
requests ~= 2.28.2
aiohttp
parsel
Maintained by Community

Actor metrics
  • 13 monthly users
  • 1 star
  • 100.0% runs succeeded
  • Created in Mar 2023
  • Modified over 1 year ago