Example of how to measure downloaded data from network requests made by a webpage.
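Once deployed, the actor can be started through the Apify API. A minimal sketch using the apify-client package (the actor ID placeholder and token handling are illustrative, not part of this example):

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Run the actor and wait for it to finish.
const run = await client.actor('<username>/measure-downloaded-bytes').call({
    url: 'https://apify.com',
    abortRequests: true,
});

// Each dataset item has the shape { url, type, size } pushed by main.js below.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Fetched ${items.length} responses`);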
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:16
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{
"actorSpecification": 1,
"name": "measure-downloaded-bytes",
"title": "Measure downloaded bytes",
"description": "Example of how to measure downloaded data from network requests made by a webpage.",
"version": "0.0",
"input": "./input-schema.json",
"dockefile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"views": {
"all": {
"title": "Requested sources",
"transformation": {
"fields": [
"url",
"type",
"size"
]
},
"display": {
"component": "table",
"properties": {
"url": {
"label": "URL",
"format": "text"
},
"type": {
"label": "Type",
"format": "text"
},
"size": {
"label": "Size in bytes",
"format": "number"
}
}
}
}
}
}
}
}
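The dataset view above displays the url, type, and size fields that main.js pushes for each network response. An illustrative dataset item (values made up):

{
    "url": "https://apify.com/some-script.js",
    "type": "script",
    "size": 48231
}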
.actor/input-schema.json
{
"title": "Measure downloaded bytes",
"type": "object",
"schemaVersion": 1,
"properties": {
"url": {
"title": "Url",
"type": "string",
"description": "URL to measure.",
"prefill": "https://apify.com",
"editor": "textfield"
},
"abortRequests": {
"title": "Abort requests",
"type": "boolean",
"description": "Setting this to true will abort requests to some ad related sources",
"editor": "checkbox"
}
},
"required": [
"url"
]
}
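An example input accepted by this schema (using the prefilled URL, with request aborting turned on):

{
    "url": "https://apify.com",
    "abortRequests": true
}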
.dockerignore
# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
"extends": "@apify",
"root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
node_modules
storage
main.js
import { launchPuppeteer } from '@crawlee/puppeteer';
import { Actor } from 'apify';
import prettyBytes from 'pretty-bytes';
// Save a full-page screenshot of the page to the key-value store (useful for debugging).
async function saveScreen(page, key = 'debug-screen.png') {
const screenshotBuffer = await page.screenshot({ fullPage: true });
await Actor.setValue(key, screenshotBuffer, { contentType: 'image/png' });
}
async function main() {
try {
const input = await Actor.getInput();
const browser = await launchPuppeteer();
const page = await browser.newPage();
// When enabled, intercept requests and abort those going to known ad/analytics sources.
if (input.abortRequests) {
await page.setRequestInterception(true);
page.on('request', (request) => {
const url = request.url();
const filters = [
'livefyre',
'moatad',
'analytics',
'controltag',
'chartbeat',
];
const shouldAbort = filters.some((urlPart) => url.includes(urlPart));
if (shouldAbort) request.abort();
else request.continue();
});
}
// Collect the URL, resource type, and body size of every response.
const responses = [];
page.on('response', async (response) => {
const url = response.url();
let size = 0;
try {
const buffer = await response.buffer();
size = buffer.byteLength;
} catch (e) {
// Ignore error here, response can be empty
}
responses.push({
url,
type: response.request().resourceType(),
size,
});
});
const startedAt = Date.now();
console.log('Opening page', input.url);
await page.goto(input.url, { timeout: 5 * 60 * 1000, waitUntil: 'networkidle2' });
await saveScreen(page);
// Sum up response sizes in total and per resource type.
let totalSize = 0;
const byResourceType = {};
responses.forEach(({ type, size }) => {
if (!byResourceType[type]) byResourceType[type] = { count: 0, size: 0 };
byResourceType[type].count++;
byResourceType[type].size += size;
totalSize += size;
});
await Actor.pushData(responses);
console.log('Page finished loading after', Date.now() - startedAt, 'ms');
console.log(`Responses: ${responses.length} (${prettyBytes(totalSize)})`);
console.log('----------------');
console.log('By resource type');
Object.keys(byResourceType).forEach((type) => {
const data = byResourceType[type];
console.log(`${type}: ${data.count} (${prettyBytes(data.size)})`);
});
await page.close();
await browser.close();
} catch (error) {
console.error(error.message);
}
}
await Actor.init();
await main();
// Exit successfully
await Actor.exit();
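Note that response.buffer() returns the decompressed response body, so the reported size is decoded bytes rather than bytes transferred over the network. If you need transfer size instead, one option is the Chrome DevTools Protocol; a sketch relying on the CDP Network domain (not used by this actor):

const cdp = await page.target().createCDPSession();
await cdp.send('Network.enable');

let transferredBytes = 0;
cdp.on('Network.loadingFinished', (event) => {
    // encodedDataLength is the number of bytes actually received for the request.
    transferredBytes += event.encodedDataLength;
});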
package.json
{
"name": "measure-downloaded-bytes-actor",
"version": "0.0.2",
"description": "",
"author": "Jaroslav Hejlek",
"license": "ISC",
"type": "module",
"engines": {
"node": ">=16.0.0"
},
"dependencies": {
"apify": "^3.1.0",
"crawlee": "^3.1.1",
"pretty-bytes": "^6.0.0",
"puppeteer": "^19.2.2"
},
"devDependencies": {
"@apify/eslint-config": "^0.3.1",
"eslint": "^8.20.0"
},
"scripts": {
"start": "node main.js",
"lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
"lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
}
}
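To try the actor locally, install dependencies with npm install and start it with npm start; with the Apify SDK's default local storage layout, Actor.getInput() reads the input object from storage/key_value_stores/default/INPUT.json (an example input is shown under .actor/input-schema.json above).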