import { Actor, log } from 'apify';
import { createCheerioRouter } from 'crawlee';

import type { Input, Item, ReplyTask, SharedState, VideoMeta } from './types.js';
import {
  apiContext,
  apiHeaders,
  buildApiCfg,
  extractCommentsTokenFromHtml,
  findFirst,
  findMany,
  getVideoId,
  numOf,
  parseOldestInput,
  parsePublishedAt,
  readJson,
  textOf,
  thumbOf,
  uniq
} from './utils.js';
21
/**
 * Reads the `ytInitialData` object embedded in a page's HTML.
 *
 * The three assignment prefixes below are handed to `readJson` in this order.
 * The route handler treats a falsy result as a hard error.
 *
 * @param html - Raw HTML of the fetched page.
 * @returns Whatever `readJson` yields for these prefixes.
 */
const readInitialData = (html: string) =>
  readJson(html, [
    'var ytInitialData = ',
    'window["ytInitialData"] = ',
    'ytInitialData = '
  ]);
28
/**
 * Reads the `ytInitialPlayerResponse` object embedded in a page's HTML.
 *
 * The three assignment prefixes below are handed to `readJson` in this order.
 * The route handler only reads `videoDetails.title` from the result and
 * tolerates it being absent.
 *
 * @param html - Raw HTML of the fetched page.
 * @returns Whatever `readJson` yields for these prefixes.
 */
const readPlayerResponse = (html: string) =>
  readJson(html, [
    'var ytInitialPlayerResponse = ',
    'window["ytInitialPlayerResponse"] = ',
    'ytInitialPlayerResponse = '
  ]);
35
/**
 * Reads the argument of the page's `ytcfg.set(` call from the HTML.
 *
 * The result is passed straight to `buildApiCfg` by the route handler.
 *
 * @param html - Raw HTML of the fetched page.
 * @returns Whatever `readJson` yields for the `ytcfg.set(` prefix.
 */
const readYtcfg = (html: string) =>
  readJson(html, [
    'ytcfg.set('
  ]);
40
/**
 * Looks for a continuation token at a fixed list of known locations on `node`.
 *
 * The locations are probed in the order written; the first non-nullish value
 * wins. Nothing is searched recursively — only these exact paths are read.
 *
 * @param node - Any renderer/endpoint-like object (may be nullish).
 * @returns The first token found, or `null` when none of the paths is set.
 */
const continuationOf = (node: any): string | null =>
  node?.continuationEndpoint?.continuationCommand?.token ??
  node?.buttonRenderer?.command?.continuationCommand?.token ??
  node?.command?.continuationCommand?.token ??
  node?.serviceEndpoint?.continuationCommand?.token ??
  node?.serviceEndpoint?.continuationEndpoint?.continuationCommand?.token ??
  node?.serviceEndpoint?.commandExecutorCommand?.commands?.[0]?.continuationCommand?.token ??
  node?.serviceEndpoint?.reloadContinuationItemsCommand?.continuationItems?.[0]?.continuationItemRenderer
    ?.continuationEndpoint?.continuationCommand?.token ??
  node?.subThreads?.continuationItemRenderer?.continuationEndpoint?.continuationCommand?.token ??
  node?.continuation?.reloadContinuationData?.continuation ??
  node?.nextContinuationData?.continuation ??
  null;
53
/**
 * Tells whether a node identifies itself as a comments section.
 *
 * A node qualifies when any of `panelIdentifier`, `targetId` or
 * `sectionIdentifier` is a string containing "comment" (case-insensitive).
 * Every field is tested on its own, so an empty or non-string value in one
 * field does not mask a matching value in another.
 *
 * @param node - Any object (may be nullish).
 * @returns `true` when at least one identifier field mentions "comment".
 */
const isCommentsNode = (node: any): boolean =>
  [node?.panelIdentifier, node?.targetId, node?.sectionIdentifier].some(
    (id) => typeof id === 'string' && /comment/i.test(id)
  );
58
/**
 * Collects the non-empty arrays found at a fixed set of container paths.
 *
 * Paths are probed in the order listed; values that are not arrays, or are
 * empty arrays, are skipped. The arrays are returned by reference (not copied).
 *
 * @param node - A response-like object (may be nullish).
 * @returns The matching arrays, in probe order; empty when none match.
 */
const contentArraysOf = (node: any): any[][] => {
  const candidates: unknown[] = [
    node?.onResponseReceivedEndpoints,
    node?.onResponseReceivedActions,
    node?.continuationContents?.itemSectionContinuation?.contents,
    node?.continuationContents?.sectionListContinuation?.contents,
    node?.continuationContents?.sectionListContinuation?.continuations,
    node?.reloadContinuationItemsCommand?.continuationItems,
    node?.appendContinuationItemsAction?.continuationItems,
    node?.contents
  ];
  return candidates.filter((value): value is any[] => Array.isArray(value) && value.length > 0);
};
74
/**
 * Extracts the total comment count advertised by a comments header.
 *
 * Searches `node` for the first object carrying `commentsHeaderRenderer`; only
 * when none exists does it search for `commentsEntryPointHeaderRenderer`. The
 * first truthy field among `countText`, `commentsCount` (header) and
 * `commentCount` (entry point) is converted with `numOf`.
 *
 * @param node - Tree to search.
 * @returns The result of `numOf` for the chosen field, or `null` when no header field is present.
 */
const commentsCountOf = (node: any): number | null => {
  const holder =
    findFirst(node, (v: any) => typeof v === 'object' && v?.commentsHeaderRenderer) ??
    findFirst(node, (v: any) => typeof v === 'object' && v?.commentsEntryPointHeaderRenderer);

  const header = holder?.commentsHeaderRenderer;
  if (header?.countText) return numOf(header.countText);
  if (header?.commentsCount) return numOf(header.commentsCount);

  const entryPoint = holder?.commentsEntryPointHeaderRenderer;
  if (entryPoint?.commentCount) return numOf(entryPoint.commentCount);

  return null;
};
84
/**
 * Finds the continuation token of the requested entry in the sort menu.
 *
 * Mode `'0'` selects the item at index 1 and, as a fallback, an item whose
 * title matches /newest/i; mode `'1'` selects index 0 and falls back to /top/i.
 * The positional pick is tried first; titles are only consulted when it
 * yields no token.
 *
 * @param node - Tree to search for a `sortFilterSubMenuRenderer`.
 * @param mode - `'0'` or `'1'`, as described above.
 * @returns The token, or `null` when the menu or a usable item is missing.
 */
const sortTokenOf = (node: any, mode: '0' | '1'): string | null => {
  const menu = findFirst(node, (v: any) => typeof v === 'object' && v?.sortFilterSubMenuRenderer);
  const items = menu?.sortFilterSubMenuRenderer?.subMenuItems;
  if (!Array.isArray(items) || items.length === 0) return null;

  // A menu item may carry its token on `serviceEndpoint` or on itself.
  const tokenOfItem = (item: any): string | null => continuationOf(item?.serviceEndpoint) ?? continuationOf(item);

  const wantsNewest = mode === '0';

  const positional = tokenOfItem(items[wantsNewest ? 1 : 0]);
  if (positional) return positional;

  const titlePattern = wantsNewest ? /newest/i : /top/i;
  for (const item of items) {
    if (!titlePattern.test(`${item?.title ?? ''}`)) continue;
    const token = tokenOfItem(item);
    if (token) return token;
  }
  return null;
};
102
/**
 * Locates the continuation token that loads the top-level comment list.
 *
 * Three strategies are tried in order, returning the first token found:
 *  1. Nodes accepted by `isCommentsNode`: the node itself, its
 *     `itemSectionRenderer`, its `continuationItemRenderer`, the first
 *     `continuationItemRenderer` inside the section contents, then the
 *     section's first `continuations` entry.
 *  2. Any `itemSectionRenderer` whose token is accompanied by evidence of
 *     comments (it passes `isCommentsNode` or yields a truthy comment count).
 *  3. Any `continuationItemRenderer` anywhere in the tree (last resort; not
 *     restricted to comment sections).
 *
 * @param node - Tree to search.
 * @returns The token, or `null` when none of the strategies finds one.
 */
const topLevelTokenOf = (node: any): string | null => {
  // Token of the first continuation item inside a section's `contents`.
  const inlineContinuationOf = (section: any): string | null =>
    continuationOf(section?.contents?.find((x: any) => x?.continuationItemRenderer)?.continuationItemRenderer);

  // Strategy 1: nodes that identify themselves as comment sections.
  const commentSections = findMany(node, (v: any) => typeof v === 'object' && isCommentsNode(v));
  for (const section of commentSections) {
    const token =
      continuationOf(section) ??
      continuationOf(section?.itemSectionRenderer) ??
      continuationOf(section?.continuationItemRenderer) ??
      inlineContinuationOf(section?.itemSectionRenderer) ??
      continuationOf(section?.itemSectionRenderer?.continuations?.[0]);
    if (token) return token;
  }

  // Strategy 2: item sections that look comment-related.
  const itemSections = findMany(node, (v: any) => typeof v === 'object' && v?.itemSectionRenderer);
  for (const wrapper of itemSections) {
    const section = wrapper.itemSectionRenderer;
    const token = inlineContinuationOf(section) ?? continuationOf(section?.continuations?.[0]);
    if (token && (isCommentsNode(section) || commentsCountOf(section))) return token;
  }

  // Strategy 3: first continuation item that carries any token.
  const candidates = findMany(node, (v: any) => typeof v === 'object' && v?.continuationItemRenderer);
  for (const candidate of candidates) {
    const token = continuationOf(candidate.continuationItemRenderer);
    if (token) return token;
  }
  return null;
};
129
/**
 * Derives the reply count of a comment thread.
 *
 * Reads, in order, the comment's `replyCount`, the replies renderer's
 * `moreText`, and the text of its "view replies" button; the first one for
 * which `textOf` returns a non-nullish value is converted with `numOf`.
 *
 * @param thread - The object that holds `commentThreadRenderer` (not the renderer itself).
 * @returns Whatever `numOf` produces for the chosen text.
 */
const replyCountOf = (thread: any): number | null => {
  const renderer = thread?.commentThreadRenderer;
  const replies = renderer?.replies?.commentRepliesRenderer;
  const label =
    textOf(renderer?.comment?.commentRenderer?.replyCount) ??
    textOf(replies?.moreText) ??
    textOf(replies?.viewReplies?.buttonRenderer?.text);
  return numOf(label);
};
137
/** True when the comment's author badge icon has type `'OWNER'`. */
const isOwner = (renderer: any): boolean =>
  renderer?.authorCommentBadge?.authorCommentBadgeRenderer?.icon?.iconType === 'OWNER';
139
/** True when the comment's action buttons include a `creatorHeartRenderer`. */
const hasHeart = (renderer: any): boolean =>
  Boolean(renderer?.actionButtons?.commentActionButtonsRenderer?.creatorHeart?.creatorHeartRenderer);
141
/** True when the comment carries a `pinnedCommentBadgeRenderer`. */
const isPinned = (renderer: any): boolean =>
  Boolean(renderer?.pinnedCommentBadge?.pinnedCommentBadgeRenderer);
143
/**
 * Converts a legacy `commentRenderer` object into an output item.
 *
 * @param renderer - The `commentRenderer` payload (may be nullish).
 * @param meta - Video-level fields copied onto every item.
 * @param type - Whether the item is a top-level comment or a reply.
 * @param replyToCid - Parent comment id for replies, otherwise `null`.
 * @param replyCount - Reply count supplied by the caller; stored as given.
 * @returns The item, or `null` when the renderer lacks a comment id or text.
 */
const parseLegacyComment = (
  renderer: any,
  meta: VideoMeta,
  type: 'comment' | 'reply',
  replyToCid: string | null,
  replyCount: number | null
): Item | null => {
  const cid = renderer?.commentId;
  const comment = textOf(renderer?.contentText);
  if (!cid || !comment) return null;

  const publishedTimeText = textOf(renderer?.publishedTimeText);

  // Author id and URL are read from the first run of the author text.
  const authorRun = renderer?.authorText?.runs?.[0];
  const authorPath = authorRun?.navigationEndpoint?.commandMetadata?.webCommandMetadata?.url;

  // Only prefix the site origin when the value is not already absolute,
  // mirroring how parseViewModelComment builds the same field.
  let authorUrl: string | null = null;
  if (authorPath) {
    const alreadyAbsolute = typeof authorPath === 'string' && authorPath.startsWith('http');
    authorUrl = alreadyAbsolute ? authorPath : `https://www.youtube.com${authorPath}`;
  }

  return {
    url: meta.pageUrl,
    pageUrl: meta.pageUrl,
    videoId: meta.videoId,
    title: meta.title,
    comment,
    cid,
    author: textOf(renderer?.authorText),
    authorId: authorRun?.navigationEndpoint?.browseEndpoint?.browseId ?? null,
    authorUrl,
    authorThumbnail: thumbOf(renderer?.authorThumbnail),
    publishedTimeText,
    publishedAt: parsePublishedAt(publishedTimeText),
    // Try the vote count object, then its simpleText, then the like button's default label.
    voteCount:
      numOf(renderer?.voteCount) ??
      numOf(renderer?.voteCount?.simpleText) ??
      numOf(renderer?.actionButtons?.commentActionButtonsRenderer?.likeButton?.toggleButtonRenderer?.defaultText),
    replyCount,
    commentsCount: meta.commentsCount,
    authorIsChannelOwner: isOwner(renderer),
    hasCreatorHeart: hasHeart(renderer),
    isPinned: isPinned(renderer),
    type,
    replyToCid,
    scrapedAt: new Date().toISOString()
  };
};
185
/**
 * Indexes entity mutations found anywhere in a response.
 *
 * A mutation is any object with a `payload` plus either an `entityKey` or a
 * `payload.commentEntityPayload.key`. Each mutation is registered under every
 * string key it exposes, so the same object can be reachable through two keys.
 * When keys collide, the mutation visited later replaces the earlier one.
 *
 * @param node - Tree to search.
 * @returns Map from key to the whole mutation object.
 */
const entityMapOf = (node: any): Map<string, any> => {
  const byKey = new Map<string, any>();
  const mutations = findMany(
    node,
    (v: any) => typeof v === 'object' && v?.payload && (v?.entityKey || v?.payload?.commentEntityPayload?.key)
  );
  for (const mutation of mutations) {
    const keys = [mutation?.entityKey, mutation?.payload?.commentEntityPayload?.key];
    for (const key of keys) {
      if (typeof key === 'string') byKey.set(key, mutation);
    }
  }
  return byKey;
};
196
/**
 * Converts a `commentViewModel` node into an output item.
 *
 * The view model only carries keys; the comment's data is read from the
 * entity mutations in `entities` (the comment payload via `commentKey`, the
 * engagement toolbar state via `toolbarStateKey`).
 *
 * @param node - Object holding `commentViewModel` (optionally nested one level deeper).
 * @param meta - Video-level fields copied onto every item.
 * @param entities - Mutation lookup, as built by `entityMapOf`.
 * @param parentCid - Parent comment id when the node is known to be a reply.
 * @param fallbackReplyCount - Used when the payload's toolbar yields no reply count.
 * @returns The item, or `null` when there is no view model, comment id or text.
 */
const parseViewModelComment = (
  node: any,
  meta: VideoMeta,
  entities: Map<string, any>,
  parentCid: string | null,
  fallbackReplyCount: number | null
): Item | null => {
  const vm = node?.commentViewModel?.commentViewModel ?? node?.commentViewModel ?? null;
  if (!vm) return null;

  // Resolve the comment payload through the entity map.
  const commentKey = vm?.commentKey;
  const commentMutation = (commentKey ? entities.get(commentKey) : null) ?? null;
  const payload = commentMutation?.payload?.commentEntityPayload ?? null;
  const props = payload?.properties ?? null;
  const author = payload?.author ?? null;
  const toolbar = payload?.toolbar ?? null;

  // Resolve the separate toolbar-state entity (like label, heart state).
  const toolbarKey = vm?.toolbarStateKey ?? props?.toolbarStateKey ?? null;
  const toolbarMutation = (toolbarKey ? entities.get(toolbarKey) : null) ?? null;
  const toolbarState = toolbarMutation?.payload?.engagementToolbarStateEntityPayload;

  const cid = props?.commentId ?? vm?.commentId ?? null;
  const comment = props?.content?.content ?? textOf(props?.content) ?? null;
  if (!cid || !comment) return null;

  const publishedTimeText = props?.publishedTime ?? null;

  const authorPath =
    author?.channelCommand?.innertubeCommand?.commandMetadata?.webCommandMetadata?.url ??
    (author?.channelId ? `/channel/${author.channelId}` : null);
  const authorUrl = authorPath
    ? authorPath.startsWith('http')
      ? authorPath
      : `https://www.youtube.com${authorPath}`
    : null;

  // Fallback thumbnail: the last entry of the avatar's source list.
  const avatarSources = payload?.avatar?.image?.sources;
  const authorThumbnail = author?.avatarThumbnailUrl ?? avatarSources?.[avatarSources.length - 1]?.url ?? null;

  // A known parent or a positive reply level marks the item as a reply.
  const replyLevel = typeof props?.replyLevel === 'number' ? props.replyLevel : 0;
  const type: 'comment' | 'reply' = parentCid || replyLevel > 0 ? 'reply' : 'comment';

  return {
    url: meta.pageUrl,
    pageUrl: meta.pageUrl,
    videoId: meta.videoId,
    title: meta.title,
    comment,
    cid,
    author: author?.displayName ?? props?.authorButtonA11y ?? null,
    authorId: author?.channelId ?? null,
    authorUrl,
    authorThumbnail,
    publishedTimeText,
    publishedAt: parsePublishedAt(publishedTimeText),
    voteCount:
      numOf(toolbar?.likeCountNotliked) ??
      numOf(toolbar?.likeCountLiked) ??
      numOf(toolbarState?.likeCountA11y),
    replyCount: numOf(toolbar?.replyCount) ?? fallbackReplyCount,
    commentsCount: meta.commentsCount,
    authorIsChannelOwner: Boolean(author?.isCreator),
    hasCreatorHeart: toolbarState?.heartState === 'TOOLBAR_HEART_STATE_HEARTED',
    isPinned: Boolean(vm?.pinnedText),
    type,
    replyToCid: type === 'reply' ? parentCid : null,
    scrapedAt: new Date().toISOString()
  };
};
253
/**
 * Recursively walks a response tree, collecting comments and continuation tokens.
 *
 * Results are appended to the caller-supplied arrays:
 *  - `items`: parsed comments and replies, in traversal order (duplicates are
 *    possible; callers are expected to de-duplicate by `cid`);
 *  - `nextTokens`: continuation tokens found while no parent comment is in scope;
 *  - `replyTasks`: continuation tokens paired with the comment they belong to.
 *
 * @param node - Current subtree; non-objects are ignored.
 * @param meta - Video-level fields copied onto every item.
 * @param entities - Mutation lookup used for view-model comments.
 * @param items - Output: parsed items.
 * @param nextTokens - Output: top-level continuation tokens.
 * @param replyTasks - Output: reply continuation tasks.
 * @param parentCid - Id of the comment whose replies are being scanned, if any.
 */
const scan = (
  node: any,
  meta: VideoMeta,
  entities: Map<string, any>,
  items: Item[],
  nextTokens: string[],
  replyTasks: ReplyTask[],
  parentCid: string | null = null
): void => {
  if (!node || typeof node !== 'object') return;

  if (Array.isArray(node)) {
    for (const child of node) scan(child, meta, entities, items, nextTokens, replyTasks, parentCid);
    return;
  }

  // A thread: one top-level comment plus (optionally) inline replies and reply continuations.
  if (node.commentThreadRenderer) {
    const thread = node.commentThreadRenderer;
    const threadReplyCount = replyCountOf(node);
    const top =
      parseLegacyComment(thread?.comment?.commentRenderer, meta, 'comment', null, threadReplyCount) ??
      parseViewModelComment(thread, meta, entities, null, threadReplyCount);
    if (top) items.push(top);

    // Replies can only be attributed when the top-level comment was parsed.
    const replyParent = top?.cid ?? null;
    const repliesRenderer = thread?.replies?.commentRepliesRenderer;
    const inlineReplies = repliesRenderer?.contents;

    const subThreadToken = continuationOf(repliesRenderer);
    if (subThreadToken && replyParent) replyTasks.push({ token: subThreadToken, parentCid: replyParent });

    if (Array.isArray(inlineReplies) && replyParent) {
      for (const entry of inlineReplies) {
        if (entry?.commentRenderer) {
          const legacyReply = parseLegacyComment(entry.commentRenderer, meta, 'reply', replyParent, null);
          if (legacyReply) items.push(legacyReply);
        }
        const vmReply = parseViewModelComment(entry, meta, entities, replyParent, null);
        if (vmReply) items.push(vmReply);

        const replyToken = continuationOf(entry?.continuationItemRenderer ?? entry);
        if (replyToken) replyTasks.push({ token: replyToken, parentCid: replyParent });
      }
    }

    // Walk the remaining thread fields; the ones handled above are skipped.
    for (const [key, value] of Object.entries(thread)) {
      if (key === 'comment' || key === 'commentViewModel' || key === 'replies') continue;
      scan(value, meta, entities, items, nextTokens, replyTasks, replyParent);
    }
    return;
  }

  // A bare legacy comment: it is a reply exactly when a parent is in scope.
  if (node.commentRenderer) {
    const legacy = parseLegacyComment(node.commentRenderer, meta, parentCid ? 'reply' : 'comment', parentCid, null);
    if (legacy) items.push(legacy);
    return;
  }

  // Unlike the branches above, the checks below do not return: the node's
  // children are still visited by the generic descent at the end.
  if (node.commentViewModel) {
    const viewModelItem = parseViewModelComment(node, meta, entities, parentCid, null);
    if (viewModelItem) items.push(viewModelItem);
  }

  if (node.continuationItemRenderer) {
    const token = continuationOf(node.continuationItemRenderer);
    if (token) {
      if (parentCid) replyTasks.push({ token, parentCid });
      else nextTokens.push(token);
    }
  }

  if (node.commentRepliesRenderer) {
    const replies = node.commentRepliesRenderer;
    const token =
      continuationOf(replies) ??
      continuationOf(replies?.continuations?.[0]) ??
      continuationOf(replies?.subThreads) ??
      continuationOf(replies?.viewReplies?.buttonRenderer);
    if (token && parentCid) replyTasks.push({ token, parentCid });
  }

  for (const value of Object.values(node)) scan(value, meta, entities, items, nextTokens, replyTasks, parentCid);
};
322
/**
 * Requests one continuation page from the `youtubei/v1/next` endpoint.
 *
 * Sends a JSON POST whose body holds the API context and the continuation
 * token; the API key from `cfg` is URL-encoded into the query string.
 *
 * @param sendRequest - Request function supplied by the crawling context.
 * @param cfg - API configuration (consumed by `apiHeaders`/`apiContext`; `cfg.key` is read here).
 * @param token - Continuation token to resolve.
 * @returns The response body when present, otherwise the response object itself.
 */
const fetchPage = async (sendRequest: any, cfg: any, token: string) => {
  const url = `https://www.youtube.com/youtubei/v1/next?prettyPrint=false&key=${encodeURIComponent(cfg.key)}`;
  const response = await sendRequest({
    url,
    method: 'POST',
    headers: apiHeaders(cfg),
    json: {
      context: apiContext(cfg),
      continuation: token
    },
    responseType: 'json'
  });
  return response.body ?? response;
};
336
// Set after the first charging failure so the warning is emitted only once.
let chargeFailureLogged = false;

/**
 * Charges one `item-scraped` event, best-effort.
 *
 * A failure never propagates to the caller. Instead of being swallowed
 * silently, the first failure is logged as a warning; later failures stay
 * quiet so a run without charging does not produce one line per item.
 */
const charge = async (): Promise<void> => {
  try {
    await Actor.charge({ eventName: 'item-scraped', count: 1 });
  } catch (err: unknown) {
    if (chargeFailureLogged) return;
    chargeFailureLogged = true;
    const reason = err instanceof Error ? err.message : String(err);
    log.warning(`Charging for event 'item-scraped' failed; continuing without charging: ${reason}`);
  }
};
342
/**
 * Builds the Cheerio router whose default handler scrapes the comments of one video page.
 *
 * For each request the handler:
 *  1. reads the embedded page data and builds the API configuration;
 *  2. finds the first comments continuation token (warns and returns if none);
 *  3. fetches the first page and, when a different sort token is available, re-fetches with it;
 *  4. follows reply and top-level continuations until the comment limit is
 *     reached, the date cutoff triggers, or no tokens remain.
 *
 * @param input - Actor input; `oldestCommentDate`, `commentsSortBy` and `maxComments` are read.
 * @param state - Shared state; `state.pushed` is incremented for every stored item.
 * @returns The configured router.
 * @throws Inside the handler: when the video id or `ytInitialData` cannot be obtained.
 */
export const createRouter = (input: Input, state: SharedState) => {
  const router = createCheerioRouter();

  router.addDefaultHandler(async ({ request, body, $, sendRequest, log }) => {
    // Prefer the raw body; fall back to Cheerio's serialization of the document.
    const html = typeof body === 'string' ? body : Buffer.isBuffer(body) ? body.toString('utf8') : $.html();
    const pageUrl = request.loadedUrl ?? request.url;
    const videoId = getVideoId(pageUrl);
    if (!videoId) throw new Error(`Missing video id for ${pageUrl}`);

    const initialData = readInitialData(html);
    if (!initialData) throw new Error(`Missing ytInitialData for ${pageUrl}`);

    const player = readPlayerResponse(html);
    const ytcfg = readYtcfg(html);
    const cfg = buildApiCfg(html, ytcfg, pageUrl);

    // Title preference: player response, then Open Graph tag, then <title>.
    const ogTitle = $('meta[property="og:title"]').attr('content')?.trim() ?? null;
    const pageTitle = $('title').text().trim() || null;
    const meta: VideoMeta = {
      videoId,
      pageUrl,
      title: player?.videoDetails?.title ?? ogTitle ?? pageTitle,
      commentsCount: commentsCountOf(initialData)
    };

    const oldest = parseOldestInput(input.oldestCommentDate);
    // A date cutoff forces mode '0' (the "newest" entry of the sort menu).
    const sortMode: '0' | '1' = oldest ? '0' : input.commentsSortBy ?? '1';
    // At least one comment; defaults to 1 when the input does not set a limit.
    const maxComments = Math.max(1, input.maxComments ?? 1);
    const seen = new Set<string>();
    const seenTokens = new Set<string>();
    let pushed = 0;
    let stopByDate = false;

    const firstToken = topLevelTokenOf(initialData) ?? extractCommentsTokenFromHtml(html);
    if (!firstToken) {
      log.warning(`Comments continuation token not found for ${pageUrl}`);
      return;
    }

    let page = await fetchPage(sendRequest, cfg, firstToken);
    // Switch to the requested ordering when its token differs from the one already fetched.
    const sortToken = sortTokenOf(page, sortMode) ?? sortTokenOf(initialData, sortMode);
    if (sortToken && sortToken !== firstToken) page = await fetchPage(sendRequest, cfg, sortToken);

    const nextTokens: string[] = [];
    const replyTasks: ReplyTask[] = [];

    // Stores one item unless it is a duplicate, the limit is reached, or the
    // date cutoff has already triggered.
    const emit = async (item: Item) => {
      if (seen.has(item.cid) || pushed >= maxComments || stopByDate) return;
      if (oldest && item.publishedAt) {
        const published = new Date(item.publishedAt);
        // The first item older than the cutoff is dropped and ends the whole run.
        if (!Number.isNaN(published.getTime()) && published < oldest) {
          stopByDate = true;
          return;
        }
      }
      seen.add(item.cid);
      await Actor.pushData(item);
      await charge();
      pushed += 1;
      state.pushed += 1;
    };

    // Parses one response: emits its items and queues newly discovered tokens.
    // Returns the number of parsed items (may count the same comment more than
    // once, since the response is scanned via its content arrays and as a whole).
    const process = async (res: any, parentCid: string | null = null) => {
      const items: Item[] = [];
      const pageNextTokens: string[] = [];
      const pageReplyTasks: ReplyTask[] = [];
      const entities = entityMapOf(res);
      for (const arr of contentArraysOf(res)) scan(arr, meta, entities, items, pageNextTokens, pageReplyTasks, parentCid);
      scan(res, meta, entities, items, pageNextTokens, pageReplyTasks, parentCid);
      for (const item of items) await emit(item);
      for (const token of uniq(pageNextTokens)) {
        if (!seenTokens.has(token)) nextTokens.push(token);
      }
      for (const task of pageReplyTasks) {
        if (!seenTokens.has(task.token)) replyTasks.push(task);
      }
      return items.length;
    };

    let parsed = await process(page);

    // Reply continuations are drained before the next top-level page is requested.
    while ((nextTokens.length || replyTasks.length) && pushed < maxComments && !stopByDate) {
      const replyTask = replyTasks.shift();
      const token = replyTask?.token ?? nextTokens.shift();
      const parentCid = replyTask?.parentCid ?? null;
      if (!token || seenTokens.has(token)) continue;
      seenTokens.add(token);
      const res = await fetchPage(sendRequest, cfg, token);
      parsed += await process(res, parentCid);
    }

    if (!parsed) log.warning(`No comments parsed for ${pageUrl}`);
  });

  return router;
};