CNN Top Stories Checker
Deprecated
Pricing
Pay per usage
Go to Store
CNN Top Stories Checker
Deprecated
Measures the data traffic generated when CNN top stories are crawled. It can optionally cache responses in memory based on the Cache-Control max-age header, and can optionally block some tracking and analytics requests.
0.0 (0)
Pricing
Pay per usage
3
Total users
16
Monthly users
1
Last modified
2 years ago
Dockerfile
# Dockerfile template used to run acts in the Actor system.
# The base image name below is filled in during the act build from user settings.
# IMPORTANT: the base image must set a correct working directory,
# such as /usr/src/app or /home/user.
FROM apify/actor-node-chrome:v0.21.10

# Copy only package.json and package-lock.json first: they are the only files
# that affect "npm install" in the next step, so this keeps the build fast.
COPY package*.json ./

# Install NPM packages without optional and development dependencies to keep
# the image small. Keep the log quiet, then print the dependency tree and the
# tool versions for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the source code into the container as the last step, so that a change
# in source code alone does not invalidate the earlier layers.
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10",
        "pretty-bytes": "^5.6.0"
    },
    "scripts": {
        "start": "node main.js"
    }
}
main.js
1const Apify = require('apify');
2const prettyBytes = require('pretty-bytes');
3
4const mainUrl = 'https://edition.cnn.com';
5
/**
 * Captures a full-page screenshot of the given page and stores it with
 * Apify.setValue as a PNG.
 *
 * @param {object} page - Page object exposing `screenshot()`.
 * @param {string} [key='debug-screen.png'] - Key the image is stored under.
 * @returns {Promise<void>}
 */
async function saveScreen(page, key = 'debug-screen.png') {
    const screenshotOptions = { fullPage: true };
    const storeOptions = { contentType: 'image/png' };

    const image = await page.screenshot(screenshotOptions);
    await Apify.setValue(key, image, storeOptions);
}
10
/**
 * Handles the front page: collects the top-story links, enqueues each one as
 * a 'post' request and finally stores a screenshot under `main.png`.
 *
 * @param {object} page - Page object of the loaded front page.
 * @param {object} requestQueue - Queue whose `addRequest()` receives the post requests.
 * @returns {Promise<void>} Resolves once every request is enqueued and the
 *     screenshot is saved.
 */
async function handleMainPage(page, requestQueue) {
    const postLinks = await page.evaluate(() => {
        const linkNodes = document.querySelectorAll('ul[data-vr-zone="home-top-col2"] li h3 a');
        return Array.from(linkNodes).map((node) => node.getAttribute('href'));
    });

    // The callback must return the addRequest() promise, otherwise Promise.all
    // would have nothing to wait for and failures would go unnoticed.
    const pendingAdds = postLinks.map((postLink, position) => {
        // An anchor without an href attribute yields null; there is nothing to crawl.
        if (!postLink) return undefined;

        return requestQueue.addRequest(new Apify.Request({
            // Resolve against the site root so that both relative and
            // absolute hrefs produce a valid URL.
            url: new URL(postLink, mainUrl).href,
            userData: {
                label: 'post',
                // Index of the link in the original list of top stories.
                position,
            },
        }));
    });
    await Promise.all(pendingAdds);

    await saveScreen(page, 'main.png');
}
27
/**
 * Handles a single story page by storing its screenshot under
 * `post-<position>.png`.
 *
 * @param {object} page - Page object of the loaded story.
 * @param {number} position - Value interpolated into the screenshot key.
 * @returns {Promise<void>}
 */
async function handlePostPage(page, position) {
    const screenshotKey = `post-${position}.png`;
    await saveScreen(page, screenshotKey);
}
31
// Entry point: crawls the CNN front page and the stories it links to, records
// the size of every response and prints the totals, overall and per resource type.
//
// Options read from INPUT:
//   abortRequests  - when truthy, requests whose URL contains one of the
//                    filter substrings below are aborted.
//   cacheResponses - when truthy, responses carrying a cache-control max-age
//                    are kept in memory and replayed until they expire.
Apify.main(async () => {
    try {
        // Fall back to an empty object so the option checks below do not throw
        // when no INPUT value is available.
        const input = (await Apify.getValue('INPUT')) || {};
        const requestQueue = await Apify.openRequestQueue();
        await requestQueue.addRequest(new Apify.Request({
            url: mainUrl,
            userData: {
                label: 'main',
            },
        }));

        // URL substrings of requests that get aborted when input.abortRequests is set.
        const abortFilters = [
            'livefyre',
            'moatad',
            'analytics',
            'controltag',
            'chartbeat',
        ];
        // url -> { status, headers, body, expires } for responses that may be replayed.
        const cache = new Map();
        // Requests answered from `cache`; their responses are excluded from the statistics.
        const servedFromCache = new WeakSet();
        // One { url, type, size } record per response counted as traffic.
        const responses = [];

        const crawler = new Apify.PuppeteerCrawler({
            requestQueue,
            maxConcurrency: 1,
            launchPuppeteerOptions: {
                headless: true,
            },
            gotoFunction: async ({ page, request }) => {
                await page.setRequestInterception(true);

                page.on('request', async (interceptedRequest) => {
                    try {
                        const url = interceptedRequest.url();

                        // Check if request should be aborted
                        if (input.abortRequests && abortFilters.some((urlPart) => url.includes(urlPart))) {
                            await interceptedRequest.abort();
                            return;
                        }

                        // If the url is cached and not yet expired, respond
                        // immediately with the locally stored response.
                        const cached = cache.get(url);
                        if (cached && cached.expires > Date.now()) {
                            servedFromCache.add(interceptedRequest);
                            await interceptedRequest.respond({
                                status: cached.status,
                                headers: cached.headers,
                                body: cached.body,
                            });
                            return;
                        }

                        await interceptedRequest.continue();
                    } catch (error) {
                        // Awaiting inside the handler keeps a failed abort/respond/continue
                        // from turning into an unhandled promise rejection.
                        console.error(`Request interception failed: ${error.message}`);
                    }
                });

                page.on('response', async (response) => {
                    // This response was loaded from cache, ignore it
                    if (servedFromCache.has(response.request())) return;

                    const url = response.url();

                    let body = null;
                    try {
                        body = await response.buffer();
                    } catch (e) {
                        // Ignore error here, response can be empty
                    }

                    responses.push({
                        url,
                        type: response.request().resourceType(),
                        size: body ? body.byteLength : 0,
                    });

                    // Without a body there is nothing that could be replayed later.
                    if (!input.cacheResponses || !body) return;

                    const headers = response.headers();
                    const cacheControl = headers['cache-control'] || '';
                    const maxAgeMatch = cacheControl.match(/max-age=(\d+)/);
                    // If specified then max-age is in seconds
                    const maxAge = maxAgeMatch ? parseInt(maxAgeMatch[1], 10) : 0;
                    if (maxAge > 0) {
                        // save response to cache
                        cache.set(url, {
                            status: response.status(),
                            headers,
                            body,
                            expires: Date.now() + (maxAge * 1000),
                        });
                    }
                });

                // NOTE(review): 5 * 60 * 10000 ms is 50 minutes; this looks like a
                // typo for 5 * 60 * 1000 (5 minutes). Value kept as-is, please confirm.
                return page.goto(request.url, { timeout: 5 * 60 * 10000, waitUntil: 'networkidle2' });
            },
            handlePageFunction: ({ page, request }) => {
                console.log(page.url());
                if (request.userData.label === 'main') return handleMainPage(page, requestQueue);
                return handlePostPage(page, request.userData.position);
            },
        });

        const startedAt = Date.now();

        await crawler.run();

        // Aggregate the recorded responses, overall and per resource type.
        let totalSize = 0;
        const byResourceType = {};
        responses.forEach(({ type, size }) => {
            if (!byResourceType[type]) byResourceType[type] = { count: 0, size: 0 };
            byResourceType[type].count++;
            byResourceType[type].size += size;
            totalSize += size;
        });

        console.log('Crawler finished after', Date.now() - startedAt, 'ms');
        console.log(`Responses: ${responses.length} (${prettyBytes(totalSize)})`);
        console.log('----------------');
        console.log('By resource type');
        Object.keys(byResourceType).forEach((type) => {
            const data = byResourceType[type];
            console.log(`${type}: ${data.count} (${prettyBytes(data.size)})`);
        });
    } catch (error) {
        // Log the whole error (including its stack) and rethrow, so the failure
        // is not silently reported as a clean finish.
        console.error(error);
        throw error;
    }
});