
Youtube User / Channel Scraper
You can use this scraper to automatically collect information about YouTube creators, such as nickname, avatar, country, description, join date, video count, view count, and subscriber count. You only need to supply the channel URLs, or the handles / channel IDs.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 283
Monthly users: 42
Runs succeeded: >99%
Last modified: a month ago
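For example, the Actor can be started from Python with the Apify API client. A minimal sketch, assuming you substitute your own API token and the Actor's Store ID (shown here as the placeholder <username>/youtube-kol-crawler):

from apify_client import ApifyClient

# Assumed placeholders: your Apify API token and the Actor's ID in the Store.
client = ApifyClient("<YOUR_APIFY_TOKEN>")

run_input = {
    # Channel homepage / handle URLs...
    "start_urls": [{"url": "https://www.youtube.com/@BBCNews"}],
    # ...and/or handles or channel IDs, comma-separated.
    "ids": "UCyBD3P9YOFWNIMTuDzqeObg",
}

# Start the Actor and wait for it to finish.
run = client.actor("<username>/youtube-kol-crawler").call(run_input=run_input)

# Fetch the scraped channel records from the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["title"], item["subscriberCount"])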
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "youtube-kol-crawler", "title": "Youtube Kol Info Crawler", "description": "Scrape data from youtube page with channel id.", "version": "0.0", "meta": { "templateId": "python-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "views": { "overview": { "title": "Overview", "transformation": { "fields": [ "channelId", "avatar", "banner", "title", "verified", "hasbusinessEmail", "joinDate", "country", "viewCount", "videoCount", "subscriberCount", "description", "links", "indexUrl", "channelUrl" ] }, "display": { "component": "table", "properties": { "channelId": { "label": "Text", "format": "text" }, "avatar": { "label": "Image", "format": "image" }, "banner": { "label": "Image", "format": "image" }, "title": { "label": "Text", "format": "text" }, "verified": { "label": "Boolean", "format": "boolean" }, "hasbusinessEmail": { "label": "Boolean", "format": "boolean" }, "indexUrl": { "label": "Link", "format": "link" }, "channelUrl": { "label": "Link", "format": "link" }, "description": { "label": "Text", "format": "text" }, "joinDate": { "label": "Text", "format": "text" }, "country": { "label": "Text", "format": "text" }, "links": { "label": "Array", "format": "array" }, "viewCount": { "label": "Number", "format": "number" }, "videoCount": { "label": "Number", "format": "number" }, "subscriberCount": { "label": "Number", "format": "number" } } } } } } }}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "Paste and input the YouTube homepage URLs,also supports channel URLs", "editor": "requestListSources", "prefill": [{"url": "https://www.youtube.com/@BBCNews"}] }, "ids": { "title": "IDs", "type": "string", "description": "Paste and input the YouTube ids, also supports channel ids, multiple inputs are separated by ',' ", "editor": "textfield", "prefill": "UCyBD3P9YOFWNIMTuDzqeObg" } }}
src/__main__.py
1"""2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging3settings. The `main()` coroutine is then executed using `asyncio.run()`.4
5Feel free to modify this file to suit your specific needs.6"""7
8import asyncio9import logging10
11from apify.log import ActorLogFormatter12
13from .main import main14
15# Configure loggers16handler = logging.StreamHandler()17handler.setFormatter(ActorLogFormatter())18
19apify_client_logger = logging.getLogger('apify_client')20apify_client_logger.setLevel(logging.INFO)21apify_client_logger.addHandler(handler)22
23apify_logger = logging.getLogger('apify')24apify_logger.setLevel(logging.DEBUG)25apify_logger.addHandler(handler)26
27# Execute the Actor main coroutine28asyncio.run(main())
src/main.py
1"""2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.3
4Feel free to modify this file to suit your specific needs.5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:7https://docs.apify.com/sdk/python8"""9
import json
import re

# Requests - library for making HTTP requests, read more at https://requests.readthedocs.io/
# lxml - library for parsing HTML and XML, read more at https://lxml.de/
import requests
from lxml import etree

# Apify SDK - toolkit for building Apify Actors, read more at https://docs.apify.com/sdk/python
from apify import Actor

def get_count(text):
    """
    Extract a numeric count from a YouTube count string.
    :param text: source string, e.g. "1.2K subscribers"
    :return: the count as an integer, or None if text is empty
    """
    if text:
        count = text.replace(",", "").split(" ")[0]
        if "K" in count:
            count = float(count.split("K")[0]) * 1000
        elif "M" in count:
            count = float(count.split("M")[0]) * 1000000
        return int(count)
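# Usage sketch (count formats as rendered on YouTube's About panel):
#   get_count("1,234 views")      -> 1234
#   get_count("1.2K subscribers") -> 1200
#   get_count("3.4M views")       -> 3400000
#   get_count(None)               -> None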
def get_link_dict(links):
    """
    Extract external link details.
    :param links: list of raw link objects from the About panel
    :return: list of dicts with "title" and "link" keys
    """
    link_list = []
    for link in links:
        item = dict()
        item["title"] = link["channelExternalLinkViewModel"]["title"]["content"]
        item["link"] = link["channelExternalLinkViewModel"]["link"]["content"]
        link_list.append(item)
    return link_list
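# Usage sketch, with illustrative values in the raw shape this function expects:
#   links = [{"channelExternalLinkViewModel": {"title": {"content": "Twitter"},
#                                              "link": {"content": "twitter.com/BBCNews"}}}]
#   get_link_dict(links) -> [{"title": "Twitter", "link": "twitter.com/BBCNews"}]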
async def main() -> None:
    """
    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with the Apify platform,
    and it also significantly enhances performance when web scraping.
    """
    async with Actor:
        # The structure of the input is defined in input_schema.json
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [])
        ids = actor_input.get('ids')
        Actor.log.info(f'ids: {ids}')
        # Normalize handles and channel IDs into channel URLs
        if ids:
            id_urls = [
                {"url": "https://www.youtube.com/channel/" + sid} if "@" not in sid
                else {"url": "https://www.youtube.com/" + sid}
                for sid in ids.split(',')
            ]
        else:
            id_urls = []
        start_urls.extend(id_urls)
        results = []
        # Browser-like headers for the requests to YouTube
        ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        headers = {
            "authority": "www.youtube.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                      "*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "en-US,en;q=0.5",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "user-agent": ua
        }
        for url in start_urls:
            Actor.log.info(f'crawling: {url}')
            try:
                response = requests.get(url.get("url"), headers=headers)
                if "This account has been terminated" in response.text:
                    Actor.log.info(f"This account has been terminated: {response.url}")
                else:
                    html = etree.HTML(response.text)
                    try:
                        banner = re.search(r'"url":"(https://[^"]+)","width":2560,"', response.text).group(1)
                    except AttributeError:
                        banner = None
                    name = html.xpath("//title/text()")[0].split(" -")[0]
                    verified = 1 if 'CHECK_CIRCLE_THICK' in response.text else 0
                    avatar = re.search(r'"avatar":{"thumbnails":\[{"url":"(.*?)"', response.text).group(1)
                    # The last continuation token on the page opens the About panel
                    kol_token = re.findall(r'"token":"(.*?)"', response.text)[-1]
                    payload = json.dumps({
                        "context": {
                            "client": {
                                "gl": "US", "deviceMake": "Apple", "deviceModel": "",
                                "userAgent": ua,
                                "clientName": "WEB", "clientVersion": "2.20240224.11.00", "osName": "Macintosh",
                            },
                            "user": {"lockedSafetyMode": False},
                            "request": {"useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": []},
                        },
                        "continuation": kol_token,
                    })
                    browse_url = "https://www.youtube.com/youtubei/v1/browse?prettyPrint=false"
                    response = requests.post(browse_url, headers=headers, data=payload)
                    info_dict = response.json()["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"][
                        "continuationItems"][0]["aboutChannelRenderer"]["metadata"]["aboutChannelViewModel"]
                    # Assemble one channel record
                    item = {}
                    item["channelId"] = info_dict.get("channelId")  # channel ID
                    item["avatar"] = avatar  # avatar image URL
                    item["banner"] = banner  # banner image URL
                    item["title"] = name  # channel nickname
                    item["verified"] = verified  # whether the channel is verified
                    item["hasbusinessEmail"] = 1 if info_dict.get("signInForBusinessEmail") else 0  # whether a business email is listed
                    item["indexUrl"] = info_dict.get("canonicalChannelUrl")  # canonical channel homepage URL
                    item["channelUrl"] = "https://www.youtube.com/channel/" + info_dict.get("channelId")
                    item["description"] = info_dict.get("description")  # channel description
                    item["joinDate"] = info_dict.get("joinedDateText").get("content").split("Joined ")[-1]  # join date
                    item["country"] = info_dict.get("country")  # country
                    links = get_link_dict(info_dict.get("links", []))
                    item["links"] = links if links else None  # external link details
                    item["viewCount"] = get_count(info_dict.get("viewCountText"))  # total view count
                    item["videoCount"] = get_count(info_dict.get("videoCountText"))  # video count
                    item["subscriberCount"] = get_count(info_dict.get("subscriberCountText"))  # subscriber count
                    Actor.log.info(f'Extracted channel info: {item}')
                    results.append(item)
            except Exception:
                Actor.log.warning(f"There was a problem with the request: {url}")

        # Save the collected records to the Dataset - a table-like storage
        await Actor.push_data(results)
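For reference, each record pushed to the dataset follows the overview view defined in .actor/actor.json. A sketch of one output item, with illustrative placeholder values:

{
    "channelId": "UCxxxxxxxxxxxxxxxxxxxxxx",
    "avatar": "https://yt3.googleusercontent.com/<avatar-image>",
    "banner": "https://yt3.googleusercontent.com/<banner-image>",
    "title": "Example Channel",
    "verified": 1,
    "hasbusinessEmail": 0,
    "joinDate": "Jan 1, 2010",
    "country": "United States",
    "viewCount": 1234567,
    "videoCount": 890,
    "subscriberCount": 100000,
    "description": "An example channel description.",
    "links": [{"title": "Twitter", "link": "twitter.com/example"}],
    "indexUrl": "https://www.youtube.com/@example",
    "channelUrl": "https://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx"
}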
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.6.0
requests
lxml