Probe Page Resources avatar
Probe Page Resources

Pricing

Pay per usage

Go to Store
Probe Page Resources

Probe Page Resources

Developed by

Jan Čurn

Jan Čurn

Maintained by Community

Sequentially loads a list of URLs in headless Chrome and analyzes HTTP resources requested by each page. Source code at https://github.com/jancurn/act-probe-page-resources

0.0 (0)

Pricing

Pay per usage

3

Total users

34

Monthly users

1

Runs succeeded

97%

Last modified

2 years ago

Dockerfile

# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer-chrome
# First, copy just package.json and package-lock.json, since they should be
# the only files that affect "npm install" in the next step; this speeds up the build
COPY package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. The "|| true" after "npm list" keeps a non-zero exit
# status of that command from aborting the "&&" chain (and thus the build).
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
# NOTE(review): "myuser" is presumably a user created by the base image - confirm
COPY --chown=myuser:myuser . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

main.js

1const chromeLauncher = require('chrome-launcher');
2const CDP = require('chrome-remote-interface');
3const _ = require('underscore');
4const Apify = require('apify');
5const typeCheck = require('type-check').typeCheck;
6
7
8// Definition of the input, written as a type-check schema string.
// Apify.main() validates the INPUT record against it with typeCheck() and
// throws 'Received invalid input' on mismatch.
//   urls       - list of URLs that are navigated to one after another
//   waitSecs   - seconds to wait after each page's load event (treated as 0 when missing)
//   verboseLog - switches the chrome-launcher log level from 'error' to 'verbose'
//   headers    - extra HTTP headers passed to Network.setExtraHTTPHeaders()
9const INPUT_TYPE = `{
10 urls: [String],
11 waitSecs: Maybe Number,
12 verboseLog: Maybe Boolean,
13 headers: Maybe Object
14}`;
15
16
/**
 * Actor entry point.
 *
 * Reads and validates INPUT, launches Chrome, attaches to it over the Chrome
 * DevTools Protocol and then navigates to every URL from `input.urls` in
 * sequence. For each URL it collects the network requests reported by the
 * Network domain events and finally stores the array
 * `[{ url, requests: { [requestId]: {...} } }]` under the OUTPUT key.
 *
 * Throws an Error('Received invalid input') when INPUT does not match INPUT_TYPE.
 */
Apify.main(async () => {
    // Fetch and check the input
    const input = await Apify.getValue('INPUT');
    if (!typeCheck(INPUT_TYPE, input)) {
        console.log('Expected input:');
        console.log(INPUT_TYPE);
        console.log('Received input:');
        console.dir(input);
        throw new Error('Received invalid input');
    }

    // Looks up a header value. An exact key match wins (original behavior);
    // otherwise the name is matched case-insensitively, because HTTP header
    // names are case-insensitive and may arrive as e.g. 'Location'.
    // Returns undefined when the header is not present.
    const getHeader = (headers, name) => {
        if (Object.prototype.hasOwnProperty.call(headers, name)) return headers[name];
        const wanted = name.toLowerCase();
        const key = Object.keys(headers).find((k) => k.toLowerCase() === wanted);
        return key === undefined ? undefined : headers[key];
    };

    // Launch Chrome
    const chrome = await launchChrome({
        headless: !!process.env.APIFY_HEADLESS,
        verboseLog: input.verboseLog
    });

    let client = null;
    try {
        client = await CDP({ port: chrome.port });

        // Result object of the page currently being probed; the event
        // handlers below write into it. It is null until the first navigation.
        let currentResult = null;

        // Extract the protocol domains used below
        const { Network, Page } = client;

        // Add HTTP headers
        if (input.headers) {
            await Network.setExtraHTTPHeaders({ headers: input.headers });
            const userAgent = getHeader(input.headers, 'User-Agent');
            if (userAgent) await Network.setUserAgentOverride({ userAgent });
        }

        // Setup event handlers
        await Network.requestWillBeSent((params) => {
            // Events may arrive before the first navigation has started
            if (!currentResult) return;

            let req = currentResult.requests[params.requestId];
            if (!req) {
                req = currentResult.requests[params.requestId] = {};
                req.url = params.request.url;
                req.method = params.request.method;
                // wallTime is multiplied by 1000 to get the milliseconds Date expects
                req.requestedAt = new Date(params.wallTime * 1000);
            } else {
                // On redirects, the Network.requestWillBeSent() is fired multiple times
                // with the same requestId and the subsequent requests contain the 'redirectResponse' field
                req.redirects = req.redirects || [];
                const redirect = _.pick(params.redirectResponse, 'url', 'status');
                redirect.location = params.redirectResponse && params.redirectResponse.headers
                    ? getHeader(params.redirectResponse.headers, 'location')
                    : null;
                req.redirects.push(redirect);
            }
        });

        await Network.responseReceived((params) => {
            if (!currentResult) return;

            // Note that responses might come from the previous page, in which case
            // the request is unknown to the current result and must be ignored
            const req = currentResult.requests[params.requestId];
            if (req) {
                req.status = params.response.status;
                req.mimeType = params.response.mimeType;
                req.type = params.type;
            }
        });

        await Network.loadingFailed((params) => {
            if (!currentResult) return;

            // Note that request failures might come from the previous page
            const req = currentResult.requests[params.requestId];
            if (req) {
                req.type = params.type;
                req.errorText = params.errorText;
                req.canceled = params.canceled;
            }
        });

        // Enable events
        await Promise.all([Network.enable(), Page.enable()]);

        // Disable cache
        await Network.setCacheDisabled({ cacheDisabled: true });

        // Iterate and probe all URLs
        const results = [];
        for (const url of input.urls) {
            console.log(`Navigating to URL: ${url}`);
            currentResult = {
                url,
                requests: {}
            };
            results.push(currentResult);

            await Page.navigate({ url });
            await Page.loadEventFired();

            // Wait input.waitSecs seconds (0 when waitSecs is missing or not a number)
            await new Promise((resolve) => setTimeout(resolve, input.waitSecs*1000 || 0));
            await Page.stopLoading();
        }

        // Save results
        await Apify.setValue('OUTPUT', results);
    } finally {
        // Always release the DevTools connection and the browser process,
        // even when probing failed. Killing Chrome is only useful for local development.
        try {
            if (client) await client.close();
        } finally {
            await chrome.kill();
        }
    }

    console.log('Done');
});
122
123
// Code inspired by https://developers.google.com/web/updates/2017/04/headless-chrome
/**
 * Launches Chrome through chrome-launcher and logs its pid, debugging port
 * and the user agent reported by CDP.Version().
 *
 * @param {Object} [options]
 * @param {boolean} [options.headless] - when truthy, Chrome is started with
 *   the '--disable-gpu' and '--headless' flags
 * @param {boolean} [options.verboseLog] - when truthy, the launcher log level
 *   is 'verbose', otherwise 'error'
 * @returns {Promise<Object>} the object returned by chromeLauncher.launch()
 * @throws rethrows any error from CDP.Version() after killing the launched Chrome
 */
const launchChrome = async (options = {}) => {
    console.log('Launching Chrome...');

    // Build the flag list without empty-string placeholders, so that no
    // empty command-line arguments are handed to Chrome when not headless
    const chromeFlags = ['--no-sandbox'];
    if (options.headless) chromeFlags.unshift('--disable-gpu', '--headless');

    const chrome = await chromeLauncher.launch({
        chromeFlags,
        logLevel: options.verboseLog ? 'verbose' : 'error',
    });

    let version;
    try {
        version = await CDP.Version({ port: chrome.port });
    } catch (err) {
        // Do not leave the browser process running when it cannot be queried
        await chrome.kill();
        throw err;
    }
    console.log(`Chrome launched (pid: ${chrome.pid}, port: ${chrome.port}, userAgent: ${version['User-Agent']})`);

    return chrome;
};

package.json

{
"name": "apify-project",
"version": "0.0.1",
"description": "",
"author": "It's not you it's me",
"license": "ISC",
"dependencies": {
"chrome-launcher": "latest",
"chrome-remote-interface": "latest",
"underscore": "latest",
"apify": "^2.2.2",
"type-check": "latest"
},
"scripts": {
"start": "node main.js"
}
}