1"""This module defines the main entry point for the Apify Actor.
2
3Feel free to modify this file to suit your specific needs.
4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
6https://docs.apify.com/sdk/python
7"""
8
9import asyncio
10from urllib.parse import urljoin
11from selenium.webdriver.support.ui import WebDriverWait
12from selenium.webdriver.support import expected_conditions as EC
13import undetected_chromedriver as uc
14from selenium.webdriver.chrome.options import Options as ChromeOptions
15from selenium.webdriver.common.by import By
16
17from apify import Actor, Request
18
19
20
21
22
23
24
async def main() -> None:
    """Main entry point for the Apify Actor.

    This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution.
    Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
    the field of web scraping significantly.

    Reads `start_urls` (and an optional `max_depth`) from the Actor input, drives an
    undetected Chrome instance to each queued URL's '/print' variant, and pushes the
    page title and `<body>` HTML to the default dataset.
    """
    async with Actor:
        # Read the Actor input: start URLs plus an optional crawl-depth limit.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls')
        # NOTE(review): max_depth is read but no link-enqueueing logic uses it yet —
        # confirm whether crawling beyond the start URLs was intended.
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()
            # Actor.exit() does not abort the coroutine; return explicitly so the
            # scraper below never runs with an empty URL list.
            return

        # Persistent queue of pending page requests (survives platform migrations).
        request_queue = await Actor.open_request_queue()

        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        # Mask the most common automation fingerprints.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
        # Required for running Chrome inside a container.
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = uc.Chrome(options=chrome_options)

        try:
            # Enqueue every start URL at depth 0. (No navigation here — the
            # processing loop below is responsible for loading pages.)
            for start_url in start_urls:
                url = start_url.get('url')
                Actor.log.info(f'Enqueuing {url} ...')
                new_request = Request.from_url(url, user_data={'depth': 0})
                await request_queue.add_request(new_request)

            # Process the queue until it is drained.
            while request := await request_queue.fetch_next_request():
                url = request.url

                if not isinstance(request.user_data['depth'], (str, int)):
                    raise TypeError('Request.depth is an unexpected type.')

                depth = int(request.user_data['depth'])
                # Build the printer-friendly variant from the *request* URL.
                # Using driver.current_url here would be wrong: the browser has
                # not navigated to this request yet, so current_url is stale.
                final = url + '/print'

                Actor.log.info(f'Scraping {final} (depth={depth}) ...')
                try:
                    # Selenium calls block; run them in a worker thread so the
                    # asyncio event loop stays responsive. Navigate exactly once.
                    await asyncio.to_thread(driver.get, final)

                    body_content = driver.find_element(By.XPATH, "//body").get_attribute("outerHTML")

                    data = {
                        'url': final,
                        'title': driver.title,
                        'body': body_content,
                    }
                    await Actor.push_data(data)
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    # Mark the request handled even on failure — otherwise
                    # fetch_next_request() keeps returning the same request and
                    # the loop never terminates.
                    await request_queue.mark_request_as_handled(request)
        finally:
            # Always release the browser, even if the loop raises.
            driver.quit()