Measure Downloaded Bytes avatar

Measure Downloaded Bytes

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Measure Downloaded Bytes

Measure Downloaded Bytes

jaroslavhejlek/measure-downloaded-bytes

Example of how to measure downloaded data from network requests made by a webpage.

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome:16
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick builds will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "measure-downloaded-bytes",
4    "title": "Measure downloaded bytes",
5    "description": "Example of how to measure downloaded data from network requests made by a webpage.",
6    "version": "0.0",
7    "input": "./input-schema.json",
8    "dockerfile": "./Dockerfile",
9    "storages": {
10        "dataset": {
11            "actorSpecification": 1,
12            "views": {
13                "all": {
14                    "title": "Requested sources",
15                    "transformation": {
16                        "fields": [
17                            "url",
18                            "type",
19                            "size"
20                        ]
21                    },
22                    "display": {
23                        "component": "table",
24                        "properties": {
25                            "url": {
26                                "label": "URL",
27                                "format": "text"
28                            },
29                            "type": {
30                                "label": "Type",
31                                "format": "text"
32                            },
33                            "size": {
34                                "label": "Size in bytes",
35                                "format": "number"
36                            }
37                        }
38                    }
39                }
40            }
41        }
42    }
43}

.actor/input-schema.json

1{
2  "title": "Measure downloaded bytes",
3  "type": "object",
4  "schemaVersion": 1,
5  "properties": {
6    "url": {
7      "title": "Url",
8      "type": "string",
9      "description": "URL to measure.",
10      "prefill": "https://apify.com",
11      "editor": "textfield"
12    },
13    "abortRequests": {
14      "title": "Abort requests",
15      "type": "boolean",
16      "description": "Setting this to true will abort requests to some ad related sources",
17      "editor": "checkbox"
18    }
19  },
20  "required": [
21    "url"
22  ]
23}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5node_modules
6storage

main.js

1import { launchPuppeteer } from '@crawlee/puppeteer';
2import { Actor } from 'apify';
3import prettyBytes from 'pretty-bytes';
4
/**
 * Takes a full-page screenshot of `page` and hands it to `Actor.setValue`
 * under the given key, tagged as a PNG image.
 *
 * @param {object} page - Page object exposing `screenshot()`.
 * @param {string} [key='debug-screen.png'] - Key the screenshot is saved under.
 * @returns {Promise<void>} Resolves once the value has been stored.
 */
async function saveScreen(page, key = 'debug-screen.png') {
    const png = await page.screenshot({ fullPage: true });
    const storeOptions = { contentType: 'image/png' };
    await Actor.setValue(key, png, storeOptions);
}
9
/** URL fragments of ad/analytics sources that get aborted when `abortRequests` is enabled. */
const BLOCKED_URL_PARTS = ['livefyre', 'moatad', 'analytics', 'controltag', 'chartbeat'];

/** Upper bound (in ms) for waiting on response handlers that are still running after navigation. */
const RESPONSE_SETTLE_TIMEOUT_MS = 10 * 1000;

/**
 * Waits until every promise has settled, but never longer than `timeoutMs`.
 * The bound matters because a response body that never finishes loading
 * (e.g. a streaming connection) would otherwise block the run forever.
 *
 * @param {Promise<unknown>[]} promises - Promises to wait for.
 * @param {number} timeoutMs - Maximum time to wait in milliseconds.
 * @returns {Promise<unknown>} Resolves when all promises settled or the timeout elapsed.
 */
function settleWithin(promises, timeoutMs) {
    let timer;
    const timeout = new Promise((resolve) => {
        timer = setTimeout(resolve, timeoutMs);
    });
    return Promise.race([Promise.allSettled(promises), timeout])
        .finally(() => clearTimeout(timer));
}

/**
 * Aggregates recorded responses into a grand total and per-resource-type totals.
 *
 * @param {{ type: string, size: number }[]} responses - Recorded responses.
 * @returns {{ totalSize: number, byResourceType: Map<string, { count: number, size: number }> }}
 */
function summarizeResponses(responses) {
    let totalSize = 0;
    const byResourceType = new Map();
    for (const { type, size } of responses) {
        const bucket = byResourceType.get(type) ?? { count: 0, size: 0 };
        bucket.count += 1;
        bucket.size += size;
        byResourceType.set(type, bucket);
        totalSize += size;
    }
    return { totalSize, byResourceType };
}

/**
 * Reads the body size of a single response and appends a record to `sink`.
 *
 * @param {object} response - Puppeteer HTTP response.
 * @param {{ url: string, type: string, size: number }[]} sink - Array collecting the records.
 * @returns {Promise<void>}
 */
async function recordResponse(response, sink) {
    let size = 0;
    try {
        const buffer = await response.buffer();
        size = buffer.byteLength;
    } catch (e) {
        // Deliberately ignored: the response can be empty (no body to read),
        // in which case it is still recorded with size 0.
    }
    sink.push({
        url: response.url(),
        type: response.request().resourceType(),
        size,
    });
}

/**
 * Opens `url` in a fresh browser, records every response, pushes the records
 * to the dataset and logs a summary. The browser is always closed, even when
 * navigation or any later step throws.
 *
 * @param {{ url: string, abortRequests?: boolean }} input - Actor input.
 * @returns {Promise<void>}
 */
async function measurePage({ url, abortRequests }) {
    const browser = await launchPuppeteer();
    try {
        const page = await browser.newPage();

        if (abortRequests) {
            await page.setRequestInterception(true);
            page.on('request', (request) => {
                const requestUrl = request.url();
                const shouldAbort = BLOCKED_URL_PARTS.some((part) => requestUrl.includes(part));
                const handled = shouldAbort ? request.abort() : request.continue();
                // abort()/continue() return promises; report failures instead of
                // leaving them as unhandled rejections.
                handled.catch((error) => {
                    console.warn(`Could not handle request ${requestUrl}: ${error.message}`);
                });
            });
        }

        const responses = [];
        const pendingRecords = [];
        page.on('response', (response) => {
            // Track the handler's promise so the summary can wait for bodies
            // that are still being read when navigation resolves.
            const record = recordResponse(response, responses).catch((error) => {
                console.warn(`Could not record response ${response.url()}: ${error.message}`);
            });
            pendingRecords.push(record);
        });

        const startedAt = Date.now();
        console.log('Opening page', url);
        await page.goto(url, { timeout: 5 * 60 * 1000, waitUntil: 'networkidle2' });
        // Measured right after navigation so the screenshot and the wait below
        // do not inflate the reported load time.
        const loadTimeMs = Date.now() - startedAt;
        await saveScreen(page);

        await settleWithin(pendingRecords, RESPONSE_SETTLE_TIMEOUT_MS);
        // Snapshot: handlers that finish after the timeout must not change
        // what was summarized and pushed.
        const recorded = [...responses];
        const { totalSize, byResourceType } = summarizeResponses(recorded);

        await Actor.pushData(recorded);

        console.log('Page finished loading after', loadTimeMs, 'ms');
        console.log(`Responses: ${recorded.length} (${prettyBytes(totalSize)})`);
        console.log('----------------');
        console.log('By resource type');
        for (const [type, data] of byResourceType) {
            console.log(`${type}: ${data.count} (${prettyBytes(data.size)})`);
        }
    } finally {
        // Closing the browser also closes its pages.
        await browser.close();
    }
}

/**
 * Actor entry point: reads the input, measures the downloaded bytes of
 * `input.url` and reports the result. Any failure is logged and the run is
 * ended through `Actor.fail()` instead of being reported as a success.
 *
 * @returns {Promise<void>}
 */
async function main() {
    try {
        const input = await Actor.getInput();
        if (!input?.url) throw new Error('Input field "url" is required.');
        await measurePage(input);
    } catch (error) {
        console.error(error.message);
        await Actor.fail(`Measuring downloaded bytes failed: ${error.message}`);
    }
}
78
// Initialize the Actor before any other Actor.* call is made.
await Actor.init();

// Run the measurement.
await main();

// Finish the run with a successful exit.
await Actor.exit();

package.json

1{
2    "name": "measure-downloaded-bytes-actor",
3    "version": "0.0.2",
4    "description": "",
5    "author": "Jaroslav Hejlek",
6    "license": "ISC",
7    "type": "module",
8    "engines": {
9        "node": ">=16.0.0"
10    },
11    "dependencies": {
12        "apify": "^3.1.0",
13        "crawlee": "^3.1.1",
14        "pretty-bytes": "^6.0.0",
15        "puppeteer": "^19.2.2"
16    },
17    "devDependencies": {
18        "@apify/eslint-config": "^0.3.1",
19        "eslint": "^8.20.0"
20    },
21    "scripts": {
22        "start": "node main.js",
23        "lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
24        "lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
25        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
26    }
27}
Developer
Maintained by Community