CNN Top Stories Checker
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
CNN Top Stories Checker
jaroslavhejlek/cnn-top-stories
Measures the data traffic generated when the CNN top stories are crawled. Can optionally cache responses in memory based on the cache-control max-age header, and can optionally block some tracking and analytics requests.
Dockerfile
1# This is a template for a Dockerfile used to run acts in the Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# First, copy just package.json and package-lock.json, since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
1{
2 "name": "apify-project",
3 "version": "0.0.1",
4 "description": "",
5 "author": "It's not you it's me",
6 "license": "ISC",
7 "dependencies": {
8 "apify": "0.21.10",
9 "pretty-bytes": "latest"
10 },
11 "scripts": {
12 "start": "node main.js"
13 }
14}
main.js
1const Apify = require('apify');
2const prettyBytes = require('pretty-bytes');
3
4const mainUrl = 'https://edition.cnn.com';
5
/**
 * Takes a full-page screenshot of `page` and stores it through
 * `Apify.setValue` with the `image/png` content type.
 *
 * @param {object} page - Page object exposing `screenshot()`.
 * @param {string} [key='debug-screen.png'] - Key the image is stored under.
 * @returns {Promise<void>} Resolves once the screenshot has been stored.
 */
async function saveScreen(page, key = 'debug-screen.png') {
    const storeOptions = { contentType: 'image/png' };
    const image = await page.screenshot({ fullPage: true });
    await Apify.setValue(key, image, storeOptions);
}
10
/**
 * Handles the home page: collects the top-story links, enqueues each of them
 * as a request labelled 'post', then saves a screenshot as `main.png`.
 *
 * @param {object} page - Page with the home page loaded.
 * @param {object} requestQueue - Queue that receives one request per story link.
 * @returns {Promise<void>} Resolves after every link has been enqueued and the
 *   screenshot has been saved; rejects if enqueuing or the screenshot fails.
 */
async function handleMainPage(page, requestQueue) {
    // The callback runs inside the page, so it may only return serializable data.
    const postLinks = await page.evaluate(() => {
        const linkNodes = document.querySelectorAll('ul[data-vr-zone="home-top-col2"] li h3 a');
        return Array.from(linkNodes).map((node) => node.getAttribute('href'));
    });

    const pendingAdds = [];
    postLinks.forEach((postLink, position) => {
        // Anchors without an href cannot be crawled. They are skipped, but
        // `position` stays the link's index among all matched anchors.
        if (!postLink) return;

        pendingAdds.push(requestQueue.addRequest(new Apify.Request({
            // Resolve against the site root so that both relative paths and
            // already-absolute hrefs produce a valid URL.
            url: new URL(postLink, mainUrl).href,
            userData: {
                label: 'post',
                position,
            },
        })));
    });

    // Wait for the queue writes so that failures surface here instead of
    // becoming unhandled rejections.
    await Promise.all(pendingAdds);

    await saveScreen(page, 'main.png');
}
27
/**
 * Handles a story page by saving a screenshot of it under a key derived
 * from `position` (`post-<position>.png`).
 *
 * @param {object} page - Page with the story loaded.
 * @param {number} position - Value used to build the screenshot key.
 * @returns {Promise<void>} Resolves once the screenshot has been saved.
 */
async function handlePostPage(page, position) {
    const screenshotKey = `post-${position}.png`;
    await saveScreen(page, screenshotKey);
}
31
// URL fragments of tracking/analytics requests that are aborted when the
// `abortRequests` input option is set.
const ABORTED_URL_PARTS = [
    'livefyre',
    'moatad',
    'analytics',
    'controltag',
    'chartbeat',
];

// Headers that describe the body as it was sent over the wire. The cache stores
// the body returned by `response.buffer()`, so these are dropped on replay.
// NOTE(review): assumes `response.buffer()` yields the decoded body - confirm
// against the Puppeteer version in use.
const NON_REPLAYABLE_HEADERS = ['content-encoding', 'content-length'];

/**
 * Extracts the `max-age` value from a Cache-Control header.
 *
 * @param {string} [cacheControl] - Raw header value.
 * @returns {number} max-age in seconds, or 0 when it is not specified.
 */
function parseMaxAgeSeconds(cacheControl) {
    const match = /max-age=(\d+)/.exec(cacheControl || '');
    return match ? parseInt(match[1], 10) : 0;
}

/**
 * Returns a copy of `headers` without the entries listed in NON_REPLAYABLE_HEADERS.
 *
 * @param {Object<string, string>} headers - Response headers with lower-case names.
 * @returns {Object<string, string>} Headers that can accompany the cached body.
 */
function toReplayableHeaders(headers) {
    const replayable = {};
    Object.keys(headers).forEach((name) => {
        if (!NON_REPLAYABLE_HEADERS.includes(name)) replayable[name] = headers[name];
    });
    return replayable;
}

/**
 * Aggregates recorded responses into a total size and per-resource-type stats.
 *
 * @param {Array<{type: string, size: number}>} responses - Recorded responses.
 * @returns {{totalSize: number, byResourceType: Object<string, {count: number, size: number}>}}
 */
function summarizeResponses(responses) {
    const byResourceType = {};
    let totalSize = 0;
    responses.forEach(({ type, size }) => {
        if (!byResourceType[type]) byResourceType[type] = { count: 0, size: 0 };
        byResourceType[type].count++;
        byResourceType[type].size += size;
        totalSize += size;
    });
    return { totalSize, byResourceType };
}

// Entry point: crawls the main page and the story pages it links to, records
// the size of every response received, and prints totals per resource type.
// Input options read here: `abortRequests` and `cacheResponses`.
Apify.main(async () => {
    try {
        // A run may be started without any input; treat that as "no options set".
        const input = (await Apify.getValue('INPUT')) || {};
        const requestQueue = await Apify.openRequestQueue();
        await requestQueue.addRequest(new Apify.Request({
            url: mainUrl,
            userData: {
                label: 'main',
            },
        }));

        // url -> { status, headers, body, expires } for responses that carried a max-age.
        const cache = new Map();
        // Requests answered from `cache`; their responses are not network traffic.
        const servedFromCache = new WeakSet();
        // One { url, type, size } record per response that came from the network.
        const responses = [];

        const crawler = new Apify.PuppeteerCrawler({
            requestQueue,
            maxConcurrency: 1,
            launchPuppeteerOptions: {
                headless: true,
            },
            gotoFunction: async ({ page, request }) => {
                await page.setRequestInterception(true);

                page.on('request', async (interceptedRequest) => {
                    // Event handlers are not awaited by the emitter, so failures
                    // are caught here rather than left as unhandled rejections.
                    try {
                        const url = interceptedRequest.url();

                        if (input.abortRequests && ABORTED_URL_PARTS.some((urlPart) => url.includes(urlPart))) {
                            await interceptedRequest.abort();
                            return;
                        }

                        // If the URL is cached and still fresh, respond immediately
                        // with the locally stored response.
                        const cached = cache.get(url);
                        if (cached && cached.expires > Date.now()) {
                            servedFromCache.add(interceptedRequest);
                            await interceptedRequest.respond({
                                status: cached.status,
                                headers: cached.headers,
                                body: cached.body,
                            });
                            return;
                        }
                        // Drop expired entries so they are fetched and stored again.
                        if (cached) cache.delete(url);

                        await interceptedRequest.continue();
                    } catch (error) {
                        console.error(`Request interception failed: ${error.message}`);
                    }
                });

                page.on('response', async (response) => {
                    const interceptedRequest = response.request();
                    // This response was loaded from the cache, ignore it.
                    if (servedFromCache.has(interceptedRequest)) return;

                    const url = response.url();

                    let body = null;
                    try {
                        body = await response.buffer();
                    } catch (e) {
                        // Ignore error here, response can be empty
                    }

                    // If specified then max-age is in seconds
                    const headers = response.headers();
                    const maxAge = parseMaxAgeSeconds(headers['cache-control']);
                    if (input.cacheResponses && maxAge > 0 && body) {
                        cache.set(url, {
                            status: response.status(),
                            headers: toReplayableHeaders(headers),
                            body,
                            expires: Date.now() + (maxAge * 1000),
                        });
                    }

                    responses.push({
                        url,
                        type: interceptedRequest.resourceType(),
                        size: body ? body.byteLength : 0,
                    });
                });

                // NOTE(review): this timeout evaluates to 50 minutes; 5 * 60 * 1000
                // (5 minutes) may have been intended. Value kept unchanged.
                return page.goto(request.url, { timeout: 5 * 60 * 10000, waitUntil: 'networkidle2' });
            },
            handlePageFunction: ({ page, request }) => {
                console.log(page.url());
                if (request.userData.label === 'main') return handleMainPage(page, requestQueue);
                return handlePostPage(page, request.userData.position);
            },
        });

        const startedAt = Date.now();

        await crawler.run();

        const { totalSize, byResourceType } = summarizeResponses(responses);

        console.log('Crawler finished after', Date.now() - startedAt, 'ms');
        console.log(`Responses: ${responses.length} (${prettyBytes(totalSize)})`);
        console.log('----------------');
        console.log('By resource type');
        Object.keys(byResourceType).forEach((type) => {
            const data = byResourceType[type];
            console.log(`${type}: ${data.count} (${prettyBytes(data.size)})`);
        });
    } catch (error) {
        // The error is logged and not rethrown, as in the original flow.
        console.error(error.message);
    }
});
Developer
Maintained by Community
Categories