import { Actor } from "apify";
import { CheerioCrawler } from "crawlee";

await Actor.init();
console.log("Current run ID:", Actor.getEnv().actorRunId);

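// Read the actor input and resolve every option with a sensible default.
// Illustrative input shape (all keys optional; names match the reads below):
// {
//   "startUrls": [{ "url": "https://www.reddit.com/r/programming/" }],
//   "searches": ["web scraping"],
//   "sort": "top",
//   "time": "week",
//   "maxItems": 50,
//   "maxPostCount": 20,
//   "maxCommentsPerPost": 10,
//   "proxy": { "useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"] }
// }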
const input = await Actor.getInput();
const startUrls = input?.startUrls || [
  { url: "https://www.reddit.com/r/all/" },
];
const skipUserPosts = input?.skipUserPosts === true;

const ignoreStartUrls = input?.ignoreStartUrls === true;
const searches = input?.searches || [];
const searchPosts = input?.searchPosts !== false;
const searchCommunities = input?.searchCommunities === true;
const searchUsers = input?.searchUsers === true;
const searchComments = input?.searchComments === true;
const sort = input?.sort || "new";
const time = input?.time || "all";
const includeNSFW = input?.includeNSFW !== false;
// Fall back to maxPostCount when maxItems is not provided explicitly.
const maxItems = input?.maxItems || input?.maxPostCount || 10;
const maxPostCount = input?.maxPostCount || 10;
const maxComments = input?.maxCommentsPerPost !== undefined ? input.maxCommentsPerPost : 10;
const maxCommunitiesCount =
  input?.maxCommunitiesCount !== undefined ? input.maxCommunitiesCount : 2;
const maxUserCount = input?.maxUserCount !== undefined ? input.maxUserCount : 2;
const postDateLimit = input?.postDateLimit || null;
const maxPostAgeDays = input?.maxPostAgeDays !== undefined ? input.maxPostAgeDays : null;
const debugMode = input?.debugMode === true;
const startPage = input?.startPage || 1;
const endPage = input?.endPage || null;
const skipComments = input?.skipComments === true;
const maxRequestRetries = input?.maxRequestRetries || 3;
const maxConcurrency = input?.maxConcurrency || 10;
const proxyInput = input?.proxy || {
  useApifyProxy: true,
  apifyProxyGroups: ["RESIDENTIAL"],
};
// Note: CheerioCrawler performs no scrolling; this value is parsed for input
// compatibility but is not used by the crawler below.
const scrollTimeout = (input?.scrollTimeout || 40) * 1000;

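// Fail fast on inconsistent input before anything is enqueued.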
if (!ignoreStartUrls && (!Array.isArray(startUrls) || !startUrls.length)) {
  throw new Error("Invalid or missing startUrls in input");
}

if (startPage < 1) {
  throw new Error("startPage must be at least 1");
}

if (endPage !== null && endPage < startPage) {
  throw new Error("endPage must be greater than or equal to startPage");
}

if (debugMode) {
  console.log("=== DEBUG MODE ENABLED ===");
  console.log(
    "Configuration:",
    JSON.stringify(
      {
        startUrls: startUrls.length,
        ignoreStartUrls,
        searches,
        searchPosts,
        searchCommunities,
        searchUsers,
        searchComments,
        sort,
        time,
        includeNSFW,
        maxItems,
        maxPostCount,
        maxCommentsPerPost: maxComments,
        maxCommunitiesCount,
        maxUserCount,
        postDateLimit,
        maxPostAgeDays,
        startPage,
        endPage,
        skipComments,
        maxRequestRetries,
        maxConcurrency,
        scrollTimeout,
        proxyInput,
      },
      null,
      2
    )
  );
}

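// Shared state: the request queue plus global counters that enforce the
// various limits across all handlers.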
const requestQueue = await Actor.openRequestQueue();
let totalPostsScraped = 0;
let totalCommunitiesScraped = 0;
let totalUsersScraped = 0;
let totalCommentsScraped = 0;
let totalItemsPushed = 0;

// Keep only valid Reddit start URLs, dropping user URLs when configured.
const effectiveStartUrls = !ignoreStartUrls ? startUrls.filter((urlObj) => {
  const url = typeof urlObj === "string" ? urlObj : urlObj.url;
  if (!url?.includes("reddit.com")) return false;

  const isUser = url.includes("/user/") || url.includes("/u/");
  if (isUser && skipUserPosts) return false;

  return true;
}) : [];

// Split the global post budget evenly across the start URLs.
const totalValidUrls = effectiveStartUrls.length || 1;
const postsPerUrl = Math.ceil(maxPostCount / totalValidUrls);
const urlPostCounts = new Map();
const commentCountMap = new Map();

// Posts whose comments are still being fetched are parked here and flushed
// once the crawl finishes.
const postsMap = new Map();

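// Limit helpers: global item cap, per-post comment cap, and per-URL post budget.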
function canPushMoreItems() {
  return totalItemsPushed < maxItems;
}
function canPushMoreCommentsItems(postId) {
  return (commentCountMap.get(postId) ?? 0) < maxComments;
}
function incrementCommentCountLimit(postId) {
  const currentCommentCount = commentCountMap.get(postId) ?? 0;
  commentCountMap.set(postId, currentCommentCount + 1);
}

function canUrlScrapeMorePosts(baseUrl) {
  if (!baseUrl) return totalPostsScraped < maxPostCount;
  const currentCount = urlPostCounts.get(baseUrl) || 0;
  return currentCount < postsPerUrl && totalPostsScraped < maxPostCount;
}

function incrementUrlPostCount(baseUrl) {
  if (!baseUrl) return;
  const currentCount = urlPostCounts.get(baseUrl) || 0;
  urlPostCounts.set(baseUrl, currentCount + 1);
}

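// Date filters: postDateLimit is an absolute cutoff, maxPostAgeDays a
// relative one. Reddit's created_utc is in seconds, hence the * 1000.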
function meetsDateLimit(post) {
  if (!postDateLimit) return true;

  const postDate = post.created_utc ? new Date(post.created_utc * 1000) : null;
  if (!postDate) return true;

  const limitDate = new Date(postDateLimit);
  return postDate >= limitDate;
}

function debugPostTimestamps(post, title) {
  console.log(`\n🔍 TIMESTAMP DEBUG for: "${title}"`);
  console.log("Available timestamp fields:");

  const timestampFields = ["created_utc", "created", "retrieved_utc", "retrieved_on"];

  timestampFields.forEach((field) => {
    if (post[field]) {
      const asSeconds = new Date(post[field] * 1000);
      const asMilliseconds = new Date(post[field]);

      console.log(`\n${field}: ${post[field]}`);
      console.log(`  As seconds: ${asSeconds}`);
      console.log(`  As milliseconds: ${asMilliseconds}`);
      console.log(`  Field type: ${post[field] > 10000000000 ? "MILLISECONDS?" : "SECONDS?"}`);
    }
  });

  // Flag any other large numeric field that might be a timestamp.
  Object.keys(post).forEach((key) => {
    if (typeof post[key] === "number" && post[key] > 1000000000 && !timestampFields.includes(key)) {
      console.log(`\n⚠️ Potential timestamp field "${key}": ${post[key]}`);
      console.log(`  As seconds: ${new Date(post[key] * 1000)}`);
      console.log(`  As milliseconds: ${new Date(post[key])}`);
    }
  });
}

function meetsFilterPostDays(post) {
  if (maxPostAgeDays === null || maxPostAgeDays === undefined) return true;

  if (debugMode && post.title) {
    debugPostTimestamps(post, post.title);
  }

  let postDate;
  if (post.created_utc) {
    postDate = new Date(post.created_utc * 1000);
  } else if (post.created) {
    postDate = new Date(post.created * 1000);
  } else {
    if (debugMode) console.log("  No timestamp found, including post");
    return true;
  }

  if (!postDate || isNaN(postDate.getTime())) {
    if (debugMode) console.log("  Invalid date, including post");
    return true;
  }

  const now = new Date();
  const daysDifference = Math.floor((now - postDate) / (1000 * 60 * 60 * 24));
  const meetsCriteria = daysDifference <= maxPostAgeDays;

  if (debugMode) {
    console.log(`📅 Date Check - Post: "${post.title?.substring(0, 50)}..."`);
    console.log(`  Post Date: ${postDate}`);
    console.log(`  Now: ${now}`);
    console.log(`  Days Difference: ${daysDifference}`);
    console.log(`  Filter Days: ${maxPostAgeDays}`);
    console.log(`  Meets Criteria: ${meetsCriteria}`);

    if (daysDifference < 0) {
      console.log(`  ⚠️ POST IS FROM THE FUTURE!`);
    }
  }

  return meetsCriteria;
}

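// Seed the queue with Reddit's JSON search endpoints for each query. Every
// enabled search type (posts, communities, users, comments) gets its own request.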
if (searches && Array.isArray(searches) && searches.length > 0) {
  for (const searchQuery of searches) {
    const query =
      typeof searchQuery === "string"
        ? searchQuery
        : searchQuery.query || searchQuery.url;
    if (!query) continue;

    console.log(`Search mode activated for query: "${query}"`);

    if (searchPosts) {
      let searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        query
      )}&type=link&sort=${sort}`;
      if (sort === "top" && time !== "all") {
        searchUrl += `&t=${time}`;
      }
      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: 1,
          type: "search_posts",
          query: query,
        },
      });
      console.log("Added search posts URL");
    }

    if (searchCommunities) {
      const searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        query
      )}&type=sr&sort=${sort}`;
      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: 1,
          type: "search_communities",
          query: query,
        },
      });
      console.log("Added search communities URL");
    }

    if (searchUsers) {
      const searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        query
      )}&type=user`;
      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: 1,
          type: "search_users",
          query: query,
        },
      });
      console.log("Added search users URL");
    }

    if (searchComments) {
      const searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        query
      )}&type=comment&sort=${sort}`;
      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: 1,
          type: "search_comments",
          query: query,
        },
      });
      console.log("Added search comments URL");
    }
  }
}

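// Enqueue start URLs, classifying each as a post, user, or community listing
// and appending .json so Reddit returns structured data instead of HTML.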
if (!ignoreStartUrls) {
  for (const urlObj of startUrls) {
    let url = typeof urlObj === "string" ? urlObj : urlObj.url;

    if (url?.includes("reddit.com")) {
      url = url.replace(/\/$/, "");

      const isPost = url.includes("/comments/");
      const isUser = url.includes("/user/") || url.includes("/u/");
      const isCommunity = url.includes("/r/") && !isPost;

      if (isUser && skipUserPosts) {
        console.log(`Skipping user URL: ${url}`);
        continue;
      }

      // Insert .json before any existing query string so the URL stays valid.
      if (!url.includes(".json")) {
        const [path, query] = url.split("?");
        url = query ? `${path}.json?${query}` : `${path}.json`;
      }

      if (isCommunity && !url.includes("?")) {
        url = `${url}?sort=${sort}`;
        if (sort === "top" && time !== "all") {
          url += `&t=${time}`;
        }
      }

      await requestQueue.addRequest({
        url,
        userData: {
          page: startPage,
          baseUrl: url.split("?")[0].replace(".json", ""),
          isPost,
          isUser,
          isCommunity,
          type: isUser ? "user" : isPost ? "post" : "community",
        },
      });

      if (debugMode) {
        console.log(
          `Added URL: ${url} (Type: ${
            isUser ? "user" : isPost ? "post" : "community"
          })`
        );
      }
    }
  }
}

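// Proxy configuration (Apify residential by default, overridable via input).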
const proxyConfiguration = await Actor.createProxyConfiguration(proxyInput);

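// CheerioCrawler is sufficient here because every request targets Reddit's
// .json endpoints; additionalMimeTypes lets it accept JSON responses, which
// the handlers receive via the `json` context property.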
const crawler = new CheerioCrawler({
  proxyConfiguration,
  requestQueue,
  maxRequestRetries,
  maxConcurrency,

  additionalMimeTypes: ["application/json"],

  preNavigationHooks: [
    async ({ request }) => {
      request.headers = {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        Accept: "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        Referer: "https://www.reddit.com/",
        Origin: "https://www.reddit.com",
      };
    },
  ],

  requestHandler: async ({ request, json, log }) => {
    if (debugMode) {
      log.info(`Processing ${request.url} (Type: ${request.userData.type})`);
    } else {
      log.info(`Processing ${request.url}`);
    }

    try {
      const type = request.userData.type;

      switch (type) {
        case "post":
          await handlePost(json, request, log);
          break;
        case "community":
          await handleCommunityListing(json, request, log);
          break;
        case "user":
          await handleUserPosts(json, request, log);
          break;
        case "search_posts":
          await handleSearchPosts(json, request, log);
          break;
        case "search_communities":
          await handleSearchCommunities(json, request, log);
          break;
        case "search_users":
          await handleSearchUsers(json, request, log);
          break;
        case "search_comments":
          await handleSearchComments(json, request, log);
          break;
        case "comments":
          await handleComments(json, request, log);
          break;
        default:
          // Fall back to the URL classification flags when no type is set.
          if (request.userData.isPost) {
            await handlePost(json, request, log);
          } else if (request.userData.isUser) {
            await handleUserPosts(json, request, log);
          } else {
            await handleCommunityListing(json, request, log);
          }
      }
    } catch (error) {
      log.error(`Failed to process ${request.url}: ${error.message}`);
      if (debugMode) {
        console.error("Full error:", error);
      }
      throw error;
    }
  },

  // Crawlee passes the error as the second argument, not on the context.
  failedRequestHandler: async ({ request, log }, error) => {
    log.error(`Request ${request.url} failed after retries: ${error.message}`);
  },
});

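// A single post page: the JSON endpoint returns a two-element array where
// json[0] holds the post listing and json[1] the comment tree.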
async function handlePost(json, request, log) {
  if (!canPushMoreItems()) {
    log.info("Max items limit reached, skipping");
    return;
  }

  if (totalPostsScraped >= maxPostCount) {
    log.info("Max post count reached, skipping");
    return;
  }

  if (!Array.isArray(json) || json.length < 1) {
    log.warning("Invalid post response format");
    return;
  }

  const postListing = json[0];
  const postData = postListing?.data?.children?.[0]?.data;

  if (!postData) {
    log.warning("No post data found");
    return;
  }

  if (postData.stickied) {
    if (debugMode) log.info(`Skipping stickied post: ${postData.title}`);
    return;
  }

  if (!includeNSFW && postData.over_18) {
    log.info(`Skipping NSFW post: ${postData.title}`);
    return;
  }

  if (!meetsDateLimit(postData)) {
    if (debugMode)
      log.info(`Skipping post outside date limit: ${postData.title}`);
    return;
  }

  if (!meetsFilterPostDays(postData)) {
    if (debugMode)
      log.info(`Skipping post outside maxPostAgeDays: ${postData.title}`);
    return;
  }

  const post = extractPostData(postData);

  // Push the post exactly once and count it against both limits.
  await Actor.pushData(post);
  totalItemsPushed++;
  totalPostsScraped++;

  log.info(
    `Extracted post: ${post.title} (Total posts: ${totalPostsScraped}/${maxPostCount})`
  );

  if (!skipComments && maxComments > 0 && postData.permalink) {
    const commentsUrl = `https://www.reddit.com${postData.permalink}.json`;
    await requestQueue.addRequest({
      url: commentsUrl,
      userData: {
        type: "comments",
        postId: postData.id,
        postTitle: postData.title,
        communityName: postData.subreddit_name_prefixed || null,
      },
    });
    if (debugMode) {
      log.info(`Added comments URL for post: ${postData.title}`);
    }
  }
}

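// Comments for a previously scraped post. The tree is flattened depth-first
// and capped at maxComments per post.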
async function handleComments(json, request, log) {
  // The post's .json endpoint returns [postListing, commentListing].
  if (!Array.isArray(json) || json.length < 2) {
    log.warning("Invalid comments response format");
    return;
  }

  const commentsListing = json[1];
  const comments = commentsListing?.data?.children;

  if (!comments || !Array.isArray(comments)) {
    log.warning("No comments found in response");
    return;
  }

  const postId = request.userData.postId;
  const postTitle = request.userData.postTitle;
  const communityName = request.userData.communityName;

  // Walk the comment tree depth-first, skipping deleted/removed comments.
  function flattenComments(commentsList) {
    const results = [];

    function recurse(list) {
      if (!Array.isArray(list)) return;
      for (const item of list) {
        if (results.length >= maxComments) return;
        if (item.kind !== "t1") continue;
        const data = item.data;
        if (
          !data ||
          !data.body ||
          data.body === "[deleted]" ||
          data.body === "[removed]"
        )
          continue;

        let numberOfReplies = 0;
        if (
          data.replies &&
          typeof data.replies === "object" &&
          data.replies.data?.children
        ) {
          numberOfReplies = data.replies.data.children.filter(
            (c) => c.kind === "t1"
          ).length;
        }

        const commentData = {
          id: data.name || null,
          parsedId: data.id || null,
          url: data.permalink
            ? `https://www.reddit.com${data.permalink}`
            : null,
          postId: `t3_${postId}`,
          parentId: data.parent_id || null,
          username: data.author || null,
          userId: data.author_fullname || null,
          category: communityName?.replace("r/", "") || null,
          communityName: communityName || null,
          body: data.body || null,
          createdAt: data.created_utc
            ? new Date(data.created_utc * 1000).toISOString()
            : null,
          scrapedAt: new Date().toISOString(),
          upVotes: data.score || 0,
          numberOfreplies: numberOfReplies,
          html: data.body_html
            ? data.body_html
                .replace(/&amp;/g, "&")
                .replace(/&lt;/g, "<")
                .replace(/&gt;/g, ">")
                .replace(/&quot;/g, '"')
                .replace(/&#39;/g, "'")
            : null,
          dataType: "comment",
        };

        results.push(commentData);

        if (results.length >= maxComments) return;

        if (
          data.replies &&
          typeof data.replies === "object" &&
          data.replies.data?.children
        ) {
          recurse(data.replies.data.children);
        }

        if (results.length >= maxComments) return;
      }
    }

    recurse(commentsList);
    return results;
  }

  const extractedComments = flattenComments(comments).slice(0, maxComments);
  const commentCount = extractedComments.length;

  for (const comment of extractedComments) {
    if (!canPushMoreCommentsItems(postId)) {
      log.info(`Reached maxCommentsPerPost limit (${maxComments}) for this post. Stopping comment push.`);
      break;
    }
    await Actor.pushData(comment);
    incrementCommentCountLimit(postId);
    totalCommentsScraped++;
    totalItemsPushed++;
  }

  log.info(
    `Extracted ${commentCount} comments for post: ${postTitle} (Total comments: ${totalCommentsScraped}, Total items: ${totalItemsPushed}/${maxItems})`
  );
}

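// A community (subreddit) listing page. Pagination uses Reddit's `after`
// cursor; pages before startPage are skipped but still paged through.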
async function handleCommunityListing(json, request, log) {
  const currentPage = request.userData.page || 1;
  const baseUrl = request.userData.baseUrl;

  if (currentPage < startPage) {
    if (debugMode)
      log.info(`Skipping page ${currentPage} (startPage=${startPage}), continuing to next page`);

    const after = json?.data?.after;
    if (after) {
      let nextUrl = `${baseUrl}.json?sort=${sort}&after=${after}`;
      if (sort === "top" && time !== "all") {
        nextUrl += `&t=${time}`;
      }
      await requestQueue.addRequest({
        url: nextUrl,
        userData: {
          page: currentPage + 1,
          baseUrl: baseUrl,
          type: "community",
        },
      });
    }
    return;
  }

  if (endPage !== null && currentPage > endPage) {
    log.info(`Reached endPage limit (${endPage}). Stopping pagination.`);
    return;
  }

  if (!canPushMoreItems()) {
    log.info("Max items limit reached, stopping");
    return;
  }

  if (!canUrlScrapeMorePosts(baseUrl)) {
    const urlCount = urlPostCounts.get(baseUrl) || 0;
    log.info(`URL reached its post limit (${urlCount}/${postsPerUrl}). Stopping pagination for this URL.`);
    return;
  }

  const data = json?.data;
  if (!data) {
    log.warning("No data in response");
    return;
  }

  const children = data.children;
  if (!children || !Array.isArray(children)) {
    log.warning("No children found in response");
    return;
  }

  let postCount = 0;
  for (const child of children) {
    if (!canPushMoreItems()) {
      log.info(`Reached maxItems limit (${maxItems}). Stopping.`);
      break;
    }
    if (!canUrlScrapeMorePosts(baseUrl)) {
      const urlCount = urlPostCounts.get(baseUrl) || 0;
      log.info(`URL reached its post limit (${urlCount}/${postsPerUrl}). Stopping.`);
      break;
    }

    if (child.kind !== "t3") continue;

    const post = child.data;

    if (post.stickied) {
      if (debugMode) log.info(`Skipping stickied post: ${post.title}`);
      continue;
    }

    if (!includeNSFW && post.over_18) {
      if (debugMode) log.info(`Skipping NSFW post: ${post.title}`);
      continue;
    }

    if (!meetsDateLimit(post)) {
      if (debugMode) log.info(`Skipping post outside date limit: ${post.title}`);
      continue;
    }

    if (!meetsFilterPostDays(post)) {
      if (debugMode) log.info(`Skipping post outside maxPostAgeDays: ${post.title}`);
      continue;
    }

    const postData = extractPostData(post);

    postsMap.set(post.id, postData);
    postCount++;
    totalPostsScraped++;
    incrementUrlPostCount(baseUrl);

    if (!skipComments && maxComments > 0 && post.permalink) {
      const commentsUrl = `https://www.reddit.com${post.permalink}.json`;
      await requestQueue.addRequest({
        url: commentsUrl,
        userData: {
          type: "comments",
          postId: post.id,
          postTitle: post.title,
          communityName: post.subreddit_name_prefixed || null,
        },
      });
    } else {
      if (canPushMoreItems()) {
        await Actor.pushData(postData);
        totalItemsPushed++;
        postsMap.delete(post.id);
      }
    }
  }

  const urlCount = urlPostCounts.get(baseUrl) || 0;
  log.info(
    `Extracted ${postCount} posts from page ${currentPage} (URL: ${urlCount}/${postsPerUrl}, Total: ${totalPostsScraped}/${maxPostCount}, Items: ${totalItemsPushed}/${maxItems})`
  );

  if (canPushMoreItems() && canUrlScrapeMorePosts(baseUrl)) {
    const after = data.after;

    if (after && (endPage === null || currentPage < endPage)) {
      let nextUrl = `${baseUrl}.json?sort=${sort}&after=${after}`;
      if (sort === "top" && time !== "all") {
        nextUrl += `&t=${time}`;
      }

      await requestQueue.addRequest({
        url: nextUrl,
        userData: {
          page: currentPage + 1,
          baseUrl: baseUrl,
          type: "community",
        },
      });

      if (debugMode) {
        log.info(`Added page ${currentPage + 1} to queue`);
      }
    } else if (endPage !== null && currentPage >= endPage) {
      log.info(`Reached endPage (${endPage}). No more pages will be added.`);
    }
  }
}

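// A user's profile listing (/user/<name>.json); mirrors the community
// handler but paginates without sort parameters.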
async function handleUserPosts(json, request, log) {
  if (skipUserPosts) {
    log.info("Skipping user posts (skipUserPosts=true)");
    return;
  }

  const currentPage = request.userData.page || 1;
  const baseUrl = request.userData.baseUrl;

  if (currentPage < startPage) {
    if (debugMode)
      log.info(`Skipping page ${currentPage} (startPage=${startPage}), continuing to next page`);

    const after = json?.data?.after;
    if (after) {
      const nextUrl = `${baseUrl}.json?after=${after}`;
      await requestQueue.addRequest({
        url: nextUrl,
        userData: {
          page: currentPage + 1,
          baseUrl: baseUrl,
          type: "user",
        },
      });
    }
    return;
  }

  if (endPage !== null && currentPage > endPage) {
    log.info(`Reached endPage limit (${endPage}). Stopping pagination.`);
    return;
  }

  if (!canPushMoreItems()) {
    log.info("Max items limit reached, stopping");
    return;
  }

  if (!canUrlScrapeMorePosts(baseUrl)) {
    const urlCount = urlPostCounts.get(baseUrl) || 0;
    log.info(`URL reached its post limit (${urlCount}/${postsPerUrl}). Stopping.`);
    return;
  }

  const data = json?.data;

  if (!data) {
    log.warning("No data in user response");
    return;
  }

  const children = data.children;

  if (!children || !Array.isArray(children)) {
    log.warning("No children found in user response");
    return;
  }

  let postCount = 0;
  for (const child of children) {
    if (!canPushMoreItems()) break;
    if (!canUrlScrapeMorePosts(baseUrl)) break;

    if (child.kind !== "t3") continue;

    const post = child.data;

    if (post.stickied) {
      if (debugMode) log.info(`Skipping stickied post: ${post.title}`);
      continue;
    }

    if (!includeNSFW && post.over_18) {
      if (debugMode) log.info(`Skipping NSFW post: ${post.title}`);
      continue;
    }

    if (!meetsDateLimit(post)) {
      if (debugMode)
        log.info(`Skipping post outside date limit: ${post.title}`);
      continue;
    }

    if (!meetsFilterPostDays(post)) {
      if (debugMode)
        log.info(`Skipping post outside maxPostAgeDays: ${post.title}`);
      continue;
    }

    const postData = extractPostData(post);

    postsMap.set(post.id, postData);
    postCount++;
    totalPostsScraped++;
    incrementUrlPostCount(baseUrl);

    if (!skipComments && maxComments > 0 && post.permalink) {
      const commentsUrl = `https://www.reddit.com${post.permalink}.json`;
      await requestQueue.addRequest({
        url: commentsUrl,
        userData: {
          type: "comments",
          postId: post.id,
          postTitle: post.title,
          communityName: post.subreddit_name_prefixed || null,
        },
      });
    } else {
      if (canPushMoreItems()) {
        await Actor.pushData(postData);
        totalItemsPushed++;
        postsMap.delete(post.id);
      }
    }
  }

  const urlCount = urlPostCounts.get(baseUrl) || 0;
  log.info(
    `Extracted ${postCount} posts from user page ${currentPage} (URL: ${urlCount}/${postsPerUrl}, Total: ${totalPostsScraped}/${maxPostCount})`
  );

  if (canPushMoreItems() && canUrlScrapeMorePosts(baseUrl)) {
    const after = data.after;

    if (after && (endPage === null || currentPage < endPage)) {
      const nextUrl = `${baseUrl}.json?after=${after}`;

      await requestQueue.addRequest({
        url: nextUrl,
        userData: {
          page: currentPage + 1,
          baseUrl: baseUrl,
          type: "user",
        },
      });
    }
  }
}

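// Search results for posts (type=link). Search pagination also uses the
// `after` cursor; posts found here count against the global maxPostCount.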
async function handleSearchPosts(json, request, log) {
  const currentPage = request.userData.page || 1;

  if (currentPage < startPage) {
    if (debugMode)
      log.info(`Skipping page ${currentPage} (startPage=${startPage}), continuing to next page`);

    const after = json?.data?.after;
    if (after) {
      let searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        request.userData.query
      )}&type=link&sort=${sort}&after=${after}`;
      if (sort === "top" && time !== "all") {
        searchUrl += `&t=${time}`;
      }
      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: currentPage + 1,
          type: "search_posts",
          query: request.userData.query,
        },
      });
    }
    return;
  }

  if (endPage !== null && currentPage > endPage) {
    log.info(`Reached endPage limit (${endPage}). Stopping pagination.`);
    return;
  }

  if (!canPushMoreItems()) {
    log.info("Max items limit reached, stopping");
    return;
  }

  if (totalPostsScraped >= maxPostCount) {
    log.info("Max post count reached, skipping search results");
    return;
  }

  const data = json?.data;

  if (!data) {
    log.warning("No data in search response");
    return;
  }

  const children = data.children;

  if (!children || !Array.isArray(children)) {
    log.warning("No children found in search response");
    return;
  }

  let postCount = 0;
  for (const child of children) {
    if (!canPushMoreItems()) break;
    if (totalPostsScraped >= maxPostCount) break;

    if (child.kind !== "t3") continue;

    const post = child.data;

    if (post.stickied) {
      if (debugMode) log.info(`Skipping stickied post: ${post.title}`);
      continue;
    }

    if (!includeNSFW && post.over_18) {
      if (debugMode) log.info(`Skipping NSFW post: ${post.title}`);
      continue;
    }

    if (!meetsDateLimit(post)) {
      if (debugMode)
        log.info(`Skipping post outside date limit: ${post.title}`);
      continue;
    }

    if (!meetsFilterPostDays(post)) {
      if (debugMode)
        log.info(`Skipping post outside maxPostAgeDays: ${post.title}`);
      continue;
    }

    const postData = extractPostData(post);

    postsMap.set(post.id, postData);
    postCount++;
    totalPostsScraped++;

    if (!skipComments && maxComments > 0 && post.permalink) {
      const commentsUrl = `https://www.reddit.com${post.permalink}.json`;
      await requestQueue.addRequest({
        url: commentsUrl,
        userData: {
          type: "comments",
          postId: post.id,
          postTitle: post.title,
          communityName: post.subreddit_name_prefixed || null,
        },
      });
    } else {
      if (canPushMoreItems()) {
        await Actor.pushData(postData);
        totalItemsPushed++;
        postsMap.delete(post.id);
      }
    }
  }

  log.info(
    `Extracted ${postCount} posts from search page ${currentPage} (Total: ${totalPostsScraped}/${maxPostCount})`
  );

  if (canPushMoreItems() && totalPostsScraped < maxPostCount) {
    const after = data.after;

    if (after && (endPage === null || currentPage < endPage)) {
      let searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        request.userData.query
      )}&type=link&sort=${sort}&after=${after}`;
      if (sort === "top" && time !== "all") {
        searchUrl += `&t=${time}`;
      }

      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: currentPage + 1,
          type: "search_posts",
          query: request.userData.query,
        },
      });
    }
  }
}

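// Search results for communities (type=sr, kind t5).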
async function handleSearchCommunities(json, request, log) {
  if (!canPushMoreItems()) {
    log.info("Max items limit reached, stopping");
    return;
  }

  if (totalCommunitiesScraped >= maxCommunitiesCount) {
    log.info("Max communities count reached, skipping");
    return;
  }

  const data = json?.data;

  if (!data) {
    log.warning("No data in communities search response");
    return;
  }

  const children = data.children;

  if (!children || !Array.isArray(children)) {
    log.warning("No children found in communities search response");
    return;
  }

  let communityCount = 0;
  for (const child of children) {
    if (!canPushMoreItems()) break;
    if (totalCommunitiesScraped >= maxCommunitiesCount) break;

    if (child.kind !== "t5") continue;

    const community = child.data;

    const communityData = {
      dataType: "community",
      id: community.name || null,
      parsedId: community.id || null,
      communityName: community.display_name_prefixed || null,
      parsedCommunityName: community.display_name || null,
      title: community.title || null,
      url: community.url ? `https://www.reddit.com${community.url}` : null,
      subscribers: community.subscribers || 0,
      description: community.public_description || null,
      createdAt: community.created_utc
        ? new Date(community.created_utc * 1000).toISOString()
        : null,
      over18: community.over18 || false,
      iconUrl: community.icon_img || null,
      bannerUrl: community.banner_img || null,
      activeUsers: community.accounts_active || 0,
      scrapedAt: new Date().toISOString(),
    };

    await Actor.pushData(communityData);
    totalItemsPushed++;
    communityCount++;
    totalCommunitiesScraped++;
  }

  log.info(
    `Extracted ${communityCount} communities (Total: ${totalCommunitiesScraped}/${maxCommunitiesCount})`
  );

  if (canPushMoreItems() && totalCommunitiesScraped < maxCommunitiesCount) {
    const after = data.after;

    if (after) {
      const searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        request.userData.query
      )}&type=sr&sort=${sort}&after=${after}`;

      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: (request.userData.page || 1) + 1,
          type: "search_communities",
          query: request.userData.query,
        },
      });
    }
  }
}

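// Search results for users (type=user, kind t2).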
async function handleSearchUsers(json, request, log) {
  if (!canPushMoreItems()) {
    log.info("Max items limit reached, stopping");
    return;
  }

  if (totalUsersScraped >= maxUserCount) {
    log.info("Max users count reached, skipping");
    return;
  }

  const data = json?.data;

  if (!data) {
    log.warning("No data in users search response");
    return;
  }

  const children = data.children;

  if (!children || !Array.isArray(children)) {
    log.warning("No children found in users search response");
    return;
  }

  let userCount = 0;
  for (const child of children) {
    if (!canPushMoreItems()) break;
    if (totalUsersScraped >= maxUserCount) break;

    if (child.kind !== "t2") continue;

    const user = child.data;

    const userData = {
      dataType: "user",
      userId: user.name || null,
      parsedUserId: user.id || null,
      username: user.name || null,
      iconUrl: user.icon_img || null,
      linkKarma: user.link_karma || 0,
      commentKarma: user.comment_karma || 0,
      createdAt: user.created_utc
        ? new Date(user.created_utc * 1000).toISOString()
        : null,
      isGold: user.is_gold || false,
      isMod: user.is_mod || false,
      verified: user.verified || false,
      scrapedAt: new Date().toISOString(),
    };

    await Actor.pushData(userData);
    totalItemsPushed++;
    userCount++;
    totalUsersScraped++;
  }

  log.info(
    `Extracted ${userCount} users (Total: ${totalUsersScraped}/${maxUserCount})`
  );

  if (canPushMoreItems() && totalUsersScraped < maxUserCount) {
    const after = data.after;

    if (after) {
      const searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        request.userData.query
      )}&type=user&after=${after}`;

      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: (request.userData.page || 1) + 1,
          type: "search_users",
          query: request.userData.query,
        },
      });
    }
  }
}

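// Search results for comments (type=comment, kind t1). Each result carries
// its parent post's title/id in link_title/link_id.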
async function handleSearchComments(json, request, log) {
  const currentPage = request.userData.page || 1;

  if (!canPushMoreItems()) {
    log.info("Max items limit reached, stopping");
    return;
  }

  const data = json?.data;

  if (!data) {
    log.warning("No data in search comments response");
    return;
  }

  const children = data.children;

  if (!children || !Array.isArray(children)) {
    log.warning("No children found in search comments response");
    return;
  }

  let commentCount = 0;
  for (const child of children) {
    if (!canPushMoreItems()) break;

    if (child.kind !== "t1") continue;

    const comment = child.data;

    if (
      !comment ||
      !comment.body ||
      comment.body === "[deleted]" ||
      comment.body === "[removed]"
    ) {
      continue;
    }

    const postInfo = comment.link_title
      ? { title: comment.link_title, id: comment.link_id?.replace("t3_", "") }
      : null;
    const communityName = comment.subreddit_name_prefixed || null;

    // Respect the per-post comment cap when the parent post is known.
    if (postInfo?.id && !canPushMoreCommentsItems(postInfo.id)) continue;

    const commentData = {
      id: comment.name || null,
      parsedId: comment.id || null,
      url: comment.permalink
        ? `https://www.reddit.com${comment.permalink}`
        : null,
      postId: postInfo?.id ? `t3_${postInfo.id}` : null,
      parentId: comment.parent_id || null,
      username: comment.author || null,
      userId: comment.author_fullname || null,
      category: communityName?.replace("r/", "") || null,
      communityName: communityName || null,
      body: comment.body || null,
      createdAt: comment.created_utc
        ? new Date(comment.created_utc * 1000).toISOString()
        : null,
      scrapedAt: new Date().toISOString(),
      upVotes: comment.score || 0,
      numberOfreplies: comment.num_replies || 0,
      html: comment.body_html
        ? comment.body_html
            .replace(/&amp;/g, "&")
            .replace(/&lt;/g, "<")
            .replace(/&gt;/g, ">")
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
        : null,
      dataType: "comment",
    };

    await Actor.pushData(commentData);
    totalItemsPushed++;
    totalCommentsScraped++;
    commentCount++;
    if (postInfo?.id) incrementCommentCountLimit(postInfo.id);
  }

  log.info(
    `Extracted ${commentCount} comments from search page ${currentPage} (Total: ${totalCommentsScraped})`
  );

  if (canPushMoreItems()) {
    const after = data.after;

    if (after) {
      let searchUrl = `https://www.reddit.com/search.json?q=${encodeURIComponent(
        request.userData.query
      )}&type=comment&sort=${sort}&after=${after}`;
      if (sort === "top" && time !== "all") {
        searchUrl += `&t=${time}`;
      }

      await requestQueue.addRequest({
        url: searchUrl,
        userData: {
          page: currentPage + 1,
          type: "search_comments",
          query: request.userData.query,
        },
      });
    }
  }
}

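// Media helpers: collect preview/gallery image URLs and classify post URLs.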
function extractImages(post) {
  const images = [];

  if (post.preview?.images?.[0]) {
    const previewImage = post.preview.images[0];
    if (previewImage.source?.url) {
      images.push(previewImage.source.url.replace(/&amp;/g, "&"));
    }
  }

  if (post.media_metadata) {
    Object.values(post.media_metadata).forEach((media) => {
      if (media.s?.u) {
        images.push(media.s.u.replace(/&amp;/g, "&"));
      }
    });
  }

  return images;
}

function isImageUrl(url) {
  if (!url) return false;
  return /\.(jpg|jpeg|png|gif|webp)$/i.test(url);
}

function isRedditPostUrl(url) {
  if (!url) return false;
  return url.includes("reddit.com") || url.startsWith("/r/");
}

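// Normalize a raw Reddit post object into the dataset item shape.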
function extractPostData(post) {
  const postId = post.id || null;
  const fullId = post.name || null;
  const permalink = post.permalink || null;

  const selftext = post.selftext || "";
  const body = selftext.trim() !== "" ? selftext : null;

  let html = post.selftext_html || null;
  if (html) {
    html = html
      .replace(/&amp;/g, "&")
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'");
  }

  const postUrl = post.url || null;

  // A post URL that is neither an image nor a Reddit link is an external link.
  let externalLink = null;
  if (postUrl && !isImageUrl(postUrl) && !isRedditPostUrl(postUrl)) {
    externalLink = postUrl;
  }

  return {
    id: fullId,
    parsedId: postId,
    url: permalink ? `https://www.reddit.com${permalink}` : null,
    username: post.author || null,
    userId: post.author_fullname || null,
    title: post.title || null,
    communityName: post.subreddit_name_prefixed || null,
    parsedCommunityName: post.subreddit || null,
    body: body,
    html: html,
    link: externalLink,
    numberOfComments: post.num_comments || 0,
    flair: post.link_flair_text || null,
    upVotes: post.score || 0,
    upVoteRatio: post.upvote_ratio || 0,
    isVideo: post.is_video || false,
    isAd: post.promoted || false,
    over18: post.over_18 || false,
    thumbnailUrl: post.thumbnail || null,
    imageUrls: extractImages(post),
    createdAt: post.created_utc
      ? new Date(post.created_utc * 1000).toISOString()
      : null,
    scrapedAt: new Date().toISOString(),
    dataType: "post",
  };
}

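// Run the crawl. Handlers above enqueue follow-up pages and comment requests.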
await crawler.run();

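// Flush posts that were parked while waiting for their comments.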
for (const post of postsMap.values()) {
  if (canPushMoreItems()) {
    await Actor.pushData(post);
    totalItemsPushed++;
    if (debugMode) {
      console.log(`Pushed remaining post without comments: ${post.title}`);
    }
  }
}

console.log("\n=====================================================");
console.log(" 🌐 Reddit Scraping Summary Report");
console.log("=====================================================\n");

console.log("📊 **Results**");
console.log("---------------------------------------------");
console.log(`• Posts Scraped : ${totalPostsScraped}`);
console.log(`• Comments Scraped : ${totalCommentsScraped}`);
console.log(`• Communities Scraped : ${totalCommunitiesScraped}`);
console.log(`• Users Scraped : ${totalUsersScraped}`);
console.log(`• Total Items Pushed : ${totalItemsPushed}/${maxItems}\n`);

console.log("⚙️ **Configuration Used**");
console.log("---------------------------------------------");
console.log(`• Max Post Count : ${maxPostCount}`);
console.log(`• Max Comments Per Post : ${maxComments}`);
console.log(`• Sort Mode : ${sort}`);
console.log(`• Time Range : ${time}`);
console.log(`• Include NSFW : ${includeNSFW}`);
console.log(`• Max Post Age Days : ${maxPostAgeDays}\n`);

console.log("📄 **Pagination Details**");
console.log("---------------------------------------------");
console.log(`• Page Range : ${startPage} → ${endPage || "∞ (unlimited)"}\n`);

console.log("✅ Scraping Completed Successfully!");
console.log("=====================================================\n");

await Actor.exit();