1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10
11
12
13
14import json
15import re
16
17import requests
18from lxml import etree
19
20
21from apify import Actor
22from apify.storages import KeyValueStore
23
24
def get_count(text):
    """
    Parse the leading number of a counter string into an integer.

    Thousands separators (",") are removed and only the first space-separated
    token is interpreted, so "1,234 views" yields 1234. A "K", "M" or "B"
    suffix scales the value by 1e3, 1e6 or 1e9 respectively.

    :param text: counter string such as "2.01K subscribers"; may be empty or None
    :return: the count as an int, or None when ``text`` is empty or None
    :raises ValueError: if the leading token is not a recognizable number
    """
    if not text:
        return None

    token = text.replace(",", "").split(" ")[0]
    for suffix, factor in (("K", 1_000), ("M", 1_000_000), ("B", 1_000_000_000)):
        if suffix in token:
            # round() instead of int(): binary floats can land just below the
            # exact product (2.01 * 1000 == 2009.9999999999998), and int()
            # would truncate that to 2009.
            return round(float(token.split(suffix)[0]) * factor)
    return int(token)
38
39
def get_link_dict(links):
    """
    Flatten external-link entries into plain title/link records.

    :param links: list of entries, each holding a "channelExternalLinkViewModel"
        mapping with "title" and "link" sub-mappings that carry a "content" value
    :return: list of ``{"title": ..., "link": ...}`` dicts, in input order
    """
    view_models = (entry["channelExternalLinkViewModel"] for entry in links)
    return [
        {"title": vm["title"]["content"], "link": vm["link"]["content"]}
        for vm in view_models
    ]
53
54
def _scrape_channel(channel_url, headers, user_agent, timeout=30):
    """
    Fetch one channel page plus its "about" continuation and build the output item.

    :param channel_url: URL of the channel page to request
    :param headers: HTTP headers sent with both requests
    :param user_agent: user-agent string embedded in the continuation payload
    :param timeout: per-request timeout in seconds, so a stalled connection cannot hang the run
    :return: dict with the extracted channel fields, or None when the page reports a terminated account
    :raises Exception: network errors, or lookup errors when the page/JSON lacks an expected field
    """
    page = requests.get(channel_url, headers=headers, timeout=timeout)
    if "This account has been terminated" in page.text:
        Actor.log.info(f"The account has since been suspended:{page.url}")
        return None

    html = etree.HTML(page.text)

    # The banner is optional: fall back to None when the pattern is absent.
    banner_match = re.search(r'"url":"(https://[^"]+)","width":2560,"', page.text)
    banner = banner_match.group(1) if banner_match else None

    name = html.xpath("//title/text()")[0].split(" -")[0]
    verified = 1 if 'CHECK_CIRCLE_THICK' in page.text else 0
    # Raw strings keep the regex escapes (e.g. ``\[``) valid Python string literals.
    avatar = re.search(r'"avatar":{"thumbnails":\[{"url":"(.*?)"', page.text).group(1)
    # The last "token" found in the page is used as the continuation token.
    kol_token = re.findall(r'"token":"(.*?)"', page.text)[-1]

    payload = json.dumps({"context": {
        "client": {"gl": "US", "deviceMake": "Apple", "deviceModel": "",
                   "userAgent": user_agent,
                   "clientName": "WEB", "clientVersion": "2.20240224.11.00", "osName": "Macintosh",
                   }, "user": {"lockedSafetyMode": False},
        "request": {"useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": []},
    }, "continuation": kol_token})
    browse_url = "https://www.youtube.com/youtubei/v1/browse?prettyPrint=false"
    about = requests.request("POST", browse_url, headers=headers, data=payload, timeout=timeout)
    info_dict = about.json()["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"][
        "continuationItems"][0]["aboutChannelRenderer"]["metadata"]["aboutChannelViewModel"]

    links = get_link_dict(info_dict.get("links", []))
    return {
        "channelId": info_dict.get("channelId"),
        "avatar": avatar,
        "banner": banner,
        "title": name,
        "verified": verified,
        "hasbusinessEmail": 1 if info_dict.get("signInForBusinessEmail") else 0,
        "indexUrl": info_dict.get("canonicalChannelUrl"),
        "channelUrl": "https://www.youtube.com/channel/" + info_dict.get("channelId"),
        "description": info_dict.get("description"),
        "joinDate": info_dict.get("joinedDateText").get("content").split("Joined ")[-1],
        "country": info_dict.get("country"),
        "links": links if links else None,
        "viewCount": get_count(info_dict.get("viewCountText")),
        "videoCount": get_count(info_dict.get("videoCountText")),
        "subscriberCount": get_count(info_dict.get("subscriberCountText")),
    }


async def main() -> None:
    """
    Scrape channel "about" data for every requested channel and push it to the dataset.

    Reads two Actor input fields: ``start_urls`` (a list of ``{"url": ...}`` dicts)
    and ``ids`` (a comma-separated string of channel IDs or ``@handles``). Each
    channel is scraped independently; a failure on one channel is logged with its
    traceback and does not stop the others. All extracted items are pushed in a
    single ``Actor.push_data`` call at the end.

    The main coroutine is being executed using `asyncio.run()`, so do not attempt
    to make a normal function out of it, it will not work.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # ``or []`` also covers an explicit ``"start_urls": null`` in the input.
        start_urls = list(actor_input.get('start_urls') or [])
        ids = actor_input.get('ids')
        Actor.log.info(f'ids: {ids}')
        if ids:
            for sid in ids.split(','):
                sid = sid.strip()
                if not sid:
                    # Skip blanks produced by stray commas such as "a,,b" or "a,".
                    continue
                if "@" in sid:
                    start_urls.append({"url": "https://www.youtube.com/" + sid})
                else:
                    start_urls.append({"url": "https://www.youtube.com/channel/" + sid})
        headings = []

        ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        headers = {
            "authority": "www.youtube.com",
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "en-US,en;q=0.5",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "user-agent": ua
        }
        # NOTE: ``requests`` is synchronous, so each fetch blocks the event loop
        # until it completes or the timeout inside _scrape_channel expires.
        for start_url in start_urls:
            Actor.log.info(f'crawling: {start_url}')
            try:
                item = _scrape_channel(start_url.get("url"), headers, ua)
            except Exception:
                # Log the entry that actually failed, with traceback, and move on.
                Actor.log.exception(f"There are some problems with the request:{start_url}")
                continue
            if item is not None:
                Actor.log.info(f'Extracted heading: {item}')
                headings.append(item)

        await Actor.push_data(headings)