Actor picture

Measure Downloaded Bytes

jaroslavhejlek/measure-downloaded-bytes

Example of how to measure downloaded data from network requests made by a webpage.

No credit card required

Author: Jaroslav Hejlek
  • Modified
  • Users: 11
  • Runs: 508
Actor picture
Measure Downloaded Bytes

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:16

# Copy only package.json and package-lock.json first, so that the
# dependency installation layer below can be reused from the Docker
# layer cache as long as these two files do not change.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. The progress bar is disabled to avoid noisy logs;
# the dependency tree and the Node.js / NPM versions are printed for debugging.
# `|| true` keeps a failing `npm list` from failing the whole build, and the
# final `rm -r ~/.npm` deletes the ~/.npm directory once installation is done.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since this happens after the NPM install, rebuilds stay fast when only
# source files change.
COPY --chown=myuser . ./


# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "measure-downloaded-bytes",
    "title": "Measure downloaded bytes",
    "description": "Example of how to measure downloaded data from network requests made by a webpage.",
    "version": "0.0",
    "input": "./input-schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "views": {
                "all": {
                    "title": "Requested sources",
                    "transformation": {
                        "fields": [
                            "url",
                            "type",
                            "size"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "URL",
                                "format": "text"
                            },
                            "type": {
                                "label": "Type",
                                "format": "text"
                            },
                            "size": {
                                "label": "Size in bytes",
                                "format": "number"
                            }
                        }
                    }
                }
            }
        }
    }
}

.actor/input-schema.json

{
  "title": "Measure downloaded bytes",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "url": {
      "title": "Url",
      "type": "string",
      "description": "URL to measure.",
      "prefill": "https://apify.com",
      "editor": "textfield"
    },
    "abortRequests": {
      "title": "Abort requests",
      "type": "boolean",
      "description": "Setting this to true will abort requests to some ad related sources",
      "editor": "checkbox"
    }
  },
  "required": [
    "url"
  ]
}

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
node_modules
storage

main.js

import { launchPuppeteer } from '@crawlee/puppeteer';
import { Actor } from 'apify';
import prettyBytes from 'pretty-bytes';

/**
 * Captures a full-page screenshot of `page` and stores it via `Actor.setValue`.
 *
 * @param {object} page - Page object exposing an async `screenshot(options)` method.
 * @param {string} [key='debug-screen.png'] - Key under which the image is stored.
 * @returns {Promise<void>}
 */
async function saveScreen(page, key = 'debug-screen.png') {
    const storeOptions = { contentType: 'image/png' };
    const image = await page.screenshot({ fullPage: true });
    await Actor.setValue(key, image, storeOptions);
}

/** URL fragments of ad/analytics sources whose requests are aborted when `abortRequests` is set. */
const ABORTED_URL_PARTS = ['livefyre', 'moatad', 'analytics', 'controltag', 'chartbeat'];

/**
 * Opens `input.url` in a freshly launched browser, records the body size of
 * every response, pushes the records to the dataset and logs a summary
 * (overall and per resource type).
 *
 * The browser is always closed, even when navigation or any later step throws.
 *
 * @param {{ url: string, abortRequests?: boolean }} input - Actor input.
 * @returns {Promise<void>}
 * @throws Propagates any error raised while launching, navigating or storing data.
 */
async function measurePage(input) {
    const browser = await launchPuppeteer();
    try {
        const page = await browser.newPage();

        if (input.abortRequests) {
            await page.setRequestInterception(true);
            page.on('request', (request) => {
                const url = request.url();
                const shouldAbort = ABORTED_URL_PARTS.some((urlPart) => url.includes(urlPart));
                // abort()/continue() are asynchronous; a rejection must not be
                // left floating where it would surface as an unhandled rejection.
                Promise.resolve(shouldAbort ? request.abort() : request.continue())
                    .catch((error) => console.warn(`Could not handle request ${url}: ${error.message}`));
            });
        }

        const responses = [];
        // Reading a response body is asynchronous and may finish after goto()
        // resolves, so every in-flight read is tracked and awaited below.
        const pendingResponses = [];
        const recordResponse = async (response) => {
            let size = 0;
            try {
                const buffer = await response.buffer();
                size = buffer.byteLength;
            } catch (e) {
                // Ignore error here, response can be empty
            }
            responses.push({
                url: response.url(),
                type: response.request().resourceType(),
                size,
            });
        };
        page.on('response', (response) => {
            pendingResponses.push(recordResponse(response));
        });

        const startedAt = Date.now();
        console.log('Opening page', input.url);
        await page.goto(input.url, { timeout: 5 * 60 * 1000, waitUntil: 'networkidle2' });
        // Measure before the screenshot and dataset write so they do not inflate the load time.
        const loadTimeMs = Date.now() - startedAt;
        await saveScreen(page);

        // Wait for all body reads so late responses are neither lost nor miscounted.
        await Promise.allSettled(pendingResponses);

        let totalSize = 0;
        const byResourceType = new Map();
        for (const { type, size } of responses) {
            const stats = byResourceType.get(type) ?? { count: 0, size: 0 };
            stats.count += 1;
            stats.size += size;
            byResourceType.set(type, stats);
            totalSize += size;
        }

        await Actor.pushData(responses);

        console.log('Page finished loading after', loadTimeMs, 'ms');
        console.log(`Responses: ${responses.length} (${prettyBytes(totalSize)})`);
        console.log('----------------');
        console.log('By resource type');
        for (const [type, data] of byResourceType) {
            console.log(`${type}: ${data.count} (${prettyBytes(data.size)})`);
        }
    } finally {
        // Closing the browser also disposes of the pages it opened.
        await browser.close();
    }
}

/**
 * Actor entry point: reads the input, validates it and measures the page.
 *
 * Any failure is logged and reported through `Actor.fail`, so a failed
 * measurement is no longer followed by a successful exit.
 *
 * @returns {Promise<void>}
 */
async function main() {
    try {
        const input = await Actor.getInput();
        if (!input?.url) throw new Error('Input field "url" is required.');
        await measurePage(input);
    } catch (error) {
        console.error(error.message);
        await Actor.fail(error.message);
    }
}

// Actor.init() is awaited first, so every Actor.* call made inside main()
// happens after the SDK has been initialized.
await Actor.init();

// Run the measurement; the statement below is reached once main() has returned.
await main();

// Exit successfully
await Actor.exit();

package.json

{
    "name": "measure-downloaded-bytes-actor",
    "version": "0.0.2",
    "description": "",
    "author": "Jaroslav Hejlek",
    "license": "ISC",
    "type": "module",
    "engines": {
        "node": ">=16.0.0"
    },
    "dependencies": {
        "apify": "^3.1.0",
        "crawlee": "^3.1.1",
        "pretty-bytes": "^6.0.0",
        "puppeteer": "^19.2.2"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.3.1",
        "eslint": "^8.20.0"
    },
    "scripts": {
        "start": "node main.js",
        "lint": "./node_modules/.bin/eslint . --ext .js,.jsx",
        "lint:fix": "./node_modules/.bin/eslint . --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    }
}