CNN Top Stories Checker avatar

CNN Top Stories Checker

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
CNN Top Stories Checker

CNN Top Stories Checker

jaroslavhejlek/cnn-top-stories

Measures the data traffic generated while crawling CNN top stories. Can optionally cache responses in memory based on the Cache-Control max-age header, and can optionally block some tracking and analytics requests.

Dockerfile

1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# Second, copy just package.json and package-lock.json since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY --chown=myuser:myuser . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10",
        "pretty-bytes": "^5.6.0"
    },
    "scripts": {
        "start": "node main.js"
    }
}

main.js

1const Apify = require('apify');
2const prettyBytes = require('pretty-bytes');
3
4const mainUrl = 'https://edition.cnn.com';
5
/**
 * Takes a full-page screenshot of the given page and stores it through
 * `Apify.setValue` under the given key as a PNG image.
 *
 * @param {Object} page - Page object exposing `screenshot()`.
 * @param {string} [key='debug-screen.png'] - Record key for the stored image.
 * @returns {Promise<void>} Resolves once the image has been stored.
 */
async function saveScreen(page, key = 'debug-screen.png') {
    const captureOptions = { fullPage: true };
    const storeOptions = { contentType: 'image/png' };

    const pngData = await page.screenshot(captureOptions);
    await Apify.setValue(key, pngData, storeOptions);
}
10
/**
 * Handles the home page: collects the links of the top stories, enqueues one
 * 'post' request per link and stores a screenshot of the page as `main.png`.
 *
 * @param {Object} page - Page object with the home page loaded.
 * @param {Object} requestQueue - Queue exposing `addRequest(request)`.
 * @returns {Promise<void>} Resolves once every link has been enqueued and the
 *     screenshot has been saved.
 */
async function handleMainPage(page, requestQueue) {
    const hrefs = await page.evaluate(() => {
        const linkNodes = document.querySelectorAll('ul[data-vr-zone="home-top-col2"] li h3 a');
        return Array.from(linkNodes).map(node => node.getAttribute('href'));
    });

    const postRequests = hrefs
        // Remember the on-page position before filtering so it keeps matching
        // the order of the stories on the page.
        .map((href, position) => ({ href, position }))
        // Anchors without an href attribute yield null; there is nothing to crawl.
        .filter(({ href }) => Boolean(href))
        .map(({ href, position }) => new Apify.Request({
            // Resolve against the site root so that both relative and absolute
            // hrefs produce a valid URL.
            url: new URL(href, mainUrl).href,
            userData: {
                label: 'post',
                position,
            },
        }));

    // Return the promise from the callback so that Promise.all really waits
    // until every request has been added to the queue.
    await Promise.all(postRequests.map((postRequest) => requestQueue.addRequest(postRequest)));
    await saveScreen(page, 'main.png');
}
27
/**
 * Handles a single story page by saving its screenshot as
 * `post-<position>.png`.
 *
 * @param {Object} page - Page object with the story loaded.
 * @param {number} position - Index of the story among the home page links.
 * @returns {Promise<void>} Resolves once the screenshot has been saved.
 */
async function handlePostPage(page, position) {
    const screenshotKey = `post-${position}.png`;
    await saveScreen(page, screenshotKey);
}
31
/**
 * Actor entry point.
 *
 * Crawls the home page and the top-story pages linked from it, records the
 * size and resource type of every response received from the network and
 * finally logs the totals overall and per resource type.
 *
 * Optional INPUT flags:
 *  - abortRequests:  abort requests whose URL contains one of the substrings
 *                    listed in `abortFilters`.
 *  - cacheResponses: keep responses carrying a `cache-control: max-age=N`
 *                    header in memory and answer repeated requests for the
 *                    same URL locally until the entry expires.
 */
Apify.main(async () => {
    try {
        // INPUT may be missing; treat that as "all optional features off"
        // instead of failing on the first property access.
        const input = (await Apify.getValue('INPUT')) || {};
        const requestQueue = await Apify.openRequestQueue();
        await requestQueue.addRequest(new Apify.Request({
            url: mainUrl,
            userData: {
                label: 'main',
            },
        }));

        // URL substrings of requests that are dropped when abortRequests is set.
        const abortFilters = [
            'livefyre',
            'moatad',
            'analytics',
            'controltag',
            'chartbeat',
        ];
        // 5 minutes.
        const navigationTimeoutMs = 5 * 60 * 1000;

        // url -> { status, headers, body, expires }, where expires is a ms timestamp.
        const cache = new Map();
        // Requests answered from `cache`; their responses caused no network traffic.
        const servedFromCache = new WeakSet();
        // One { url, type, size } record per response received from the network.
        const responses = [];

        // Returns the cached entry for the URL, or null when absent or expired.
        const getFreshCacheEntry = (url) => {
            const entry = cache.get(url);
            if (!entry) return null;
            if (entry.expires <= Date.now()) {
                cache.delete(url);
                return null;
            }
            return entry;
        };

        const crawler = new Apify.PuppeteerCrawler({
            requestQueue,
            maxConcurrency: 1,
            launchPuppeteerOptions: {
                headless: true,
            },
            gotoFunction: async ({ page, request }) => {
                await page.setRequestInterception(true);

                page.on('request', async (interceptedRequest) => {
                    try {
                        const url = interceptedRequest.url();

                        if (input.abortRequests && abortFilters.some((urlPart) => url.includes(urlPart))) {
                            await interceptedRequest.abort();
                            return;
                        }

                        // If the url is cached and still fresh, respond
                        // immediately with the locally stored response.
                        const cached = getFreshCacheEntry(url);
                        if (cached) {
                            servedFromCache.add(interceptedRequest);
                            await interceptedRequest.respond({
                                status: cached.status,
                                headers: cached.headers,
                                body: cached.body,
                            });
                            return;
                        }

                        await interceptedRequest.continue();
                    } catch (error) {
                        // Nothing awaits an event listener, so a rejection here
                        // would be unhandled; report it instead.
                        console.error(`Request interception failed: ${error.message}`);
                    }
                });

                page.on('response', async (response) => {
                    // This response was served from the local cache, ignore it.
                    if (servedFromCache.has(response.request())) return;

                    const url = response.url();

                    let body = null;
                    let size = 0;
                    try {
                        body = await response.buffer();
                        size = body.byteLength;
                    } catch (e) {
                        // Ignore error here, response can be empty
                    }

                    if (input.cacheResponses && body) {
                        const headers = response.headers();
                        const cacheControl = headers['cache-control'] || '';
                        const maxAgeMatch = cacheControl.match(/max-age=(\d+)/);
                        // If specified then max-age is in seconds
                        const maxAge = maxAgeMatch ? parseInt(maxAgeMatch[1], 10) : 0;

                        if (maxAge > 0) {
                            cache.set(url, {
                                status: response.status(),
                                headers,
                                body,
                                expires: Date.now() + (maxAge * 1000),
                            });
                        }
                    }

                    responses.push({
                        url,
                        type: response.request().resourceType(),
                        size,
                    });
                });

                return page.goto(request.url, { timeout: navigationTimeoutMs, waitUntil: 'networkidle2' });
            },
            handlePageFunction: ({ page, request }) => {
                console.log(page.url());
                if (request.userData.label === 'main') return handleMainPage(page, requestQueue);
                return handlePostPage(page, request.userData.position);
            },
        });

        const startedAt = Date.now();

        await crawler.run();

        // Aggregate the recorded responses overall and per resource type.
        let totalSize = 0;
        const byResourceType = {};
        responses.forEach(({ type, size }) => {
            if (!byResourceType[type]) byResourceType[type] = { count: 0, size: 0 };
            byResourceType[type].count++;
            byResourceType[type].size += size;
            totalSize += size;
        });

        console.log('Crawler finished after', Date.now() - startedAt, 'ms');
        console.log(`Responses: ${responses.length} (${prettyBytes(totalSize)})`);
        console.log('----------------');
        console.log('By resource type');
        Object.keys(byResourceType).forEach((type) => {
            const data = byResourceType[type];
            console.log(`${type}: ${data.count} (${prettyBytes(data.size)})`);
        });
    } catch (error) {
        console.error(error.message);
    }
});
Developer
Maintained by Community
Categories