1"""Module defines the main entry point for the Apify Actor.
2
3Feel free to modify this file to suit your specific needs.
4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
6https://docs.apify.com/sdk/python
7"""
8
9from __future__ import annotations
10
11
12
13
14
15from apify import Actor
16from bs4 import BeautifulSoup
17
18
19
20from httpx import AsyncClient
21
22
async def main() -> None:
    """Define a main entry point for the Apify Actor.

    This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution.
    Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
    the field of web scraping significantly.

    The Actor reads `url` from its input (falling back to https://apify.com/ when the input is missing or empty),
    downloads that page, collects every `h1`-`h6` heading as a `{'level': ..., 'text': ...}` dictionary and hands
    the resulting list to `Actor.push_data()`.

    Raises:
        ValueError: If the input does not provide a non-empty `url`.
        httpx.HTTPStatusError: If the server responds with a 4xx or 5xx status code.
    """
    async with Actor:
        # Retrieve the Actor input; use a default URL when no input (or an empty one) is provided.
        actor_input = await Actor.get_input() or {'url': 'https://apify.com/'}
        url = actor_input.get('url')
        if not url:
            raise ValueError('Missing "url" attribute in input!')

        # Create an asynchronous HTTPX client; it is closed as soon as the request has finished.
        async with AsyncClient() as client:
            Actor.log.info(f'Sending a request to {url}')
            response = await client.get(url, follow_redirects=True)
            # Fail loudly on HTTP errors instead of scraping the headings of an error page.
            response.raise_for_status()

        # Parse the downloaded HTML with Beautiful Soup using the lxml parser.
        soup = BeautifulSoup(response.content, 'lxml')

        # Collect the tag name and text of every heading found in the document.
        headings = []
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            heading_object = {'level': heading.name, 'text': heading.text}
            Actor.log.info(f'Extracted heading: {heading_object}')
            headings.append(heading_object)

        # Store all extracted headings through the Actor's data-pushing API.
        await Actor.push_data(headings)