1import { Actor } from 'apify';
2import { CheerioCrawler } from 'crawlee';
3
// Initialize the Apify SDK — must run before any other Actor.* call.
await Actor.init();

// Actor input; fall back to an empty object when none was supplied.
const actorInput = await Actor.getInput() ?? {};

// Unpack the actor configuration with sensible defaults.
const {
  searchQueries = ['artificial intelligence'],
  language = 'en',
  country = 'US',
  maxArticlesPerQuery = 50,
  timeRange = 'week',
} = actorInput;

// Google time-filter codes keyed by the human-readable range name.
// NOTE(review): the qdr:* values are Google *web* search `tbs` codes;
// the RSS URL construction below never embeds them directly.
const timeRangeMap = {
  hour: 'qdr:h',
  day: 'qdr:d',
  week: 'qdr:w',
  month: 'qdr:m',
  year: 'qdr:y',
};
23
// Remove any embedded HTML tags from a text fragment.
const stripHtml = (text) => text.replace(/<[^>]*>/g, '');

// Convert an RSS pubDate (or any date string) to ISO-8601, or null when
// the value is missing or unparseable. The previous code called
// `new Date(pubDate).toISOString()` unguarded, which throws a RangeError
// on an Invalid Date and aborted the whole request handler — losing
// every article in the feed because of a single malformed date.
const toIsoDate = (raw) => {
  if (!raw) return null;
  const parsed = new Date(raw);
  return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
};

const crawler = new CheerioCrawler({
  maxRequestsPerCrawl: searchQueries.length * 10,
  maxConcurrency: 3,
  requestHandlerTimeoutSecs: 30,

  /**
   * Handles one Google News search feed. Extracts articles from RSS
   * <item> elements, falls back to HTML article selectors when the
   * response was not an RSS document, then pushes up to
   * `maxArticlesPerQuery` results to the default dataset.
   */
  async requestHandler({ $, request, log }) {
    const { label, query, collected = 0 } = request.userData;
    if (label !== 'SEARCH') return;

    log.info(`Processing Google News search: ${query}`);

    const articles = [];

    // Primary path: RSS <item> elements (response parsed as XML).
    $('item').each((i, el) => {
      const $item = $(el);

      const title = $item.find('title').text().trim();
      const link = $item.find('link').text().trim();
      const pubDate = $item.find('pubDate').text().trim();
      const description = $item.find('description').text().trim();
      const source = $item.find('source').text().trim();
      const sourceUrl = $item.find('source').attr('url') || '';

      if (title && link) {
        articles.push({
          title: stripHtml(title),
          url: link,
          publishedAt: toIsoDate(pubDate),
          description: stripHtml(description).substring(0, 500),
          source,
          sourceUrl,
          query,
          scrapedAt: new Date().toISOString(),
        });
      }
    });

    // Fallback path: HTML article markup, used when no RSS items matched
    // (e.g. the response was a regular Google News HTML page).
    if (articles.length === 0) {
      $('article, [data-n-tid], .NiLAwe').each((i, el) => {
        const $article = $(el);
        const titleEl = $article.find('h3, h4, [role="heading"]').first();
        const linkEl = $article.find('a').first();
        const sourceEl = $article.find('.wEwyrc, .vr1PYe, time').first();
        const timeEl = $article.find('time').first();

        const title = titleEl.text().trim();
        const link = linkEl.attr('href');
        const source = sourceEl.text().trim();
        const datetime = timeEl.attr('datetime');

        if (title && link) {
          // Relative Google News links come as "./articles/..." — resolve
          // them against the news.google.com origin.
          const fullUrl = link.startsWith('http') ? link :
            link.startsWith('./') ? `https://news.google.com${link.substring(1)}` :
            `https://news.google.com${link}`;

          articles.push({
            title,
            url: fullUrl,
            publishedAt: datetime || null,
            source,
            query,
            scrapedAt: new Date().toISOString(),
          });
        }
      });
    }

    log.info(`Found ${articles.length} articles for "${query}"`);

    // Respect the per-query cap. `collected` is reserved for pagination
    // (currently always 0 — only one request is enqueued per query).
    // pushData accepts an array, so push all rows in a single call.
    await Actor.pushData(articles.slice(0, maxArticlesPerQuery - collected));
  },
});
101
102
// Map the human-readable `timeRange` input onto the `when:` operator
// that Google News RSS search actually understands (e.g. `when:7d`).
// The previous code computed a Google-web-search `tbs` code (`qdr:w`)
// but never used it, and instead appended `when:` plus the FIRST LETTER
// of the range ("when:w"), which Google News does not recognize.
const WHEN_BY_RANGE = {
  hour: '1h',
  day: '1d',
  week: '7d',
  month: '1m',
  year: '1y',
};

for (const query of searchQueries) {
  const when = WHEN_BY_RANGE[timeRange];

  // Build the RSS search URL with URL/searchParams so the query string
  // (including the optional `when:` operator) is encoded correctly,
  // instead of hand-concatenating query parameters.
  const rssUrl = new URL('https://news.google.com/rss/search');
  rssUrl.searchParams.set('q', when ? `${query} when:${when}` : query);
  rssUrl.searchParams.set('hl', language);
  rssUrl.searchParams.set('gl', country);
  rssUrl.searchParams.set('ceid', `${country}:${language}`);

  await crawler.addRequests([{
    url: rssUrl.toString(),
    userData: { label: 'SEARCH', query },
  }]);
}

await crawler.run();
await Actor.exit();