Probe Page Resources
Try for free
No credit card required
View all Actors
Probe Page Resources
jancurn/probe-page-resources
Try for free
No credit card required
Sequentially loads a list of URLs in headless Chrome and analyzes HTTP resources requested by each page. Source code at https://github.com/jancurn/act-probe-page-resources
Dockerfile
1# This is a template for a Dockerfile used to run acts in the Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-puppeteer-chrome
5
6# Second, copy just package.json and package-lock.json since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
main.js
1const chromeLauncher = require('chrome-launcher');
2const CDP = require('chrome-remote-interface');
3const _ = require('underscore');
4const Apify = require('apify');
5const typeCheck = require('type-check').typeCheck;
6
7
// Definition of the input, written in the type syntax of the 'type-check' package.
// Only `urls` is required; the `Maybe` fields are optional.
const INPUT_TYPE = `{
    urls: [String],
    waitSecs: Maybe Number,
    verboseLog: Maybe Boolean,
    headers: Maybe Object
}`;


// Actor entry point: validates the input, launches Chrome, loads every input URL
// one after another while recording the HTTP requests each page makes, and stores
// the collected results under the 'OUTPUT' key.
Apify.main(async () => {
    // Fetch and check the input
    const input = await Apify.getValue('INPUT');
    if (!typeCheck(INPUT_TYPE, input)) {
        console.log('Expected input:');
        console.log(INPUT_TYPE);
        console.log('Received input:');
        console.dir(input);
        throw new Error('Received invalid input');
    }

    // Launch Chrome
    const chrome = await launchChrome({
        headless: !!process.env.APIFY_HEADLESS,
        verboseLog: input.verboseLog
    });

    try {
        const client = await CDP({ port: chrome.port });

        // Result object of the page currently being probed; null until the first navigation
        let currentResult = null;

        // Looks up the record of a request on the current page. Returns undefined when
        // no page is being probed yet or when the request is unknown (e.g. it was
        // started by the previous page).
        const findRequest = (requestId) => (currentResult ? currentResult.requests[requestId] : undefined);

        // Extract domains
        const { Network, Page } = client;

        // Add HTTP headers
        if (input.headers) {
            await Network.setExtraHTTPHeaders({ headers: input.headers });
            if (input.headers['User-Agent']) await Network.setUserAgentOverride({ userAgent: input.headers['User-Agent'] });
        }

        // Setup event handlers
        Network.requestWillBeSent((params) => {
            if (!currentResult) return;

            let req = currentResult.requests[params.requestId];
            if (!req) {
                req = currentResult.requests[params.requestId] = {};
                req.url = params.request.url;
                req.method = params.request.method;
                req.requestedAt = new Date(params.wallTime * 1000);
            } else {
                // On redirects, the Network.requestWillBeSent() is fired multiple times
                // with the same requestId and the subsequent requests contain the 'redirectResponse' field
                req.redirects = req.redirects || [];
                const redirect = _.pick(params.redirectResponse, 'url', 'status');
                redirect.location = params.redirectResponse && params.redirectResponse.headers ? params.redirectResponse.headers['location'] : null;
                req.redirects.push(redirect);
            }
        });

        Network.responseReceived((params) => {
            // The response may belong to a request that was never recorded for the
            // current page; skip it instead of crashing on an undefined record.
            const req = findRequest(params.requestId);
            if (!req) return;

            req.status = params.response.status;
            req.mimeType = params.response.mimeType;
            req.type = params.type;
        });

        Network.loadingFailed((params) => {
            // Note that request failures might come from the previous page
            const req = findRequest(params.requestId);
            if (!req) return;

            req.type = params.type;
            req.errorText = params.errorText;
            req.canceled = params.canceled;
        });

        // Enable events
        await Promise.all([Network.enable(), Page.enable()]);

        // Disable cache
        await Network.setCacheDisabled({ cacheDisabled: true });

        // Iterate and probe all URLs
        const results = [];
        for (const url of input.urls) {
            console.log(`Navigating to URL: ${url}`);
            currentResult = {
                url,
                requests: {}
            };
            results.push(currentResult);

            await Page.navigate({ url });
            await Page.loadEventFired();

            // Wait input.waitSecs seconds (no extra wait when it is missing or not a number)
            await new Promise((resolve) => setTimeout(resolve, input.waitSecs * 1000 || 0));
            await Page.stopLoading();
        }

        // Save results
        await Apify.setValue('OUTPUT', results);
    } finally {
        // Only useful for local development, but do it even when probing failed
        // so that no orphaned Chrome process is left behind
        await chrome.kill();
    }

    console.log('Done');
});
122
123
// Code inspired by https://developers.google.com/web/updates/2017/04/headless-chrome
/**
 * Launches Chrome through chrome-launcher and logs its pid, debugging port and user agent.
 *
 * @param {Object} [options]
 * @param {boolean} [options.headless] - When truthy, Chrome is started with
 *   the `--disable-gpu` and `--headless` flags.
 * @param {boolean} [options.verboseLog] - When truthy, chrome-launcher logs at
 *   'verbose' level, otherwise at 'error' level.
 * @returns {Promise<Object>} The object resolved by `chromeLauncher.launch()`;
 *   this function reads its `pid` and `port` and may call its `kill()`.
 * @throws Rethrows the error of the version probe after killing the launched Chrome.
 */
const launchChrome = async (options = {}) => {
    console.log('Launching Chrome...');

    // Build the flag list without empty placeholder entries, so that Chrome
    // never receives '' as a command-line argument.
    const chromeFlags = [];
    if (options.headless) chromeFlags.push('--disable-gpu', '--headless');
    chromeFlags.push('--no-sandbox');

    const chrome = await chromeLauncher.launch({
        chromeFlags,
        logLevel: options.verboseLog ? 'verbose' : 'error',
    });

    let version;
    try {
        version = await CDP.Version({ port: chrome.port });
    } catch (err) {
        // The caller never receives the handle in this case, so clean up here
        await chrome.kill();
        throw err;
    }
    console.log(`Chrome launched (pid: ${chrome.pid}, port: ${chrome.port}, userAgent: ${version['User-Agent']})`);

    return chrome;
};
package.json
1{
2 "name": "apify-project",
3 "version": "0.0.1",
4 "description": "",
5 "author": "It's not you it's me",
6 "license": "ISC",
7 "dependencies": {
8 "chrome-launcher": "latest",
9 "chrome-remote-interface": "latest",
10 "underscore": "latest",
11 "apify": "^2.2.2",
12 "type-check": "latest"
13 },
14 "scripts": {
15 "start": "node main.js"
16 }
17}
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
3 stars
97% runs succeeded
Created in Aug 2017
Modified a year ago
Categories