HAR files for URL list avatar
HAR files for URL list

Deprecated

Pricing

Pay per usage

Go to Store
HAR files for URL list

HAR files for URL list

Deprecated

Developed by

Apify

Apify

Maintained by Community

Generates an HTTP Archive (HAR) file for web pages specified by a list of URLs in a JSON or CSV file. Optionally, the pages can be loaded using proxies from a specific country. The resulting HAR files are stored in the key-value store or in the dataset.

0.0 (0)

Pricing

Pay per usage

4

Total users

39

Monthly users

2

Last modified

2 years ago

Dockerfile

# Template Dockerfile used to build and run this act in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10
# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step; this speeds up
# the build when only the source code has changed
COPY package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree plus the Node.js and NPM versions for debugging. "npm list" is allowed
# to fail (|| true) so that it cannot break the build.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy the source code to the container.
# Do this in the last step, to have a fast build if only the source code changed
COPY --chown=myuser:myuser . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
"name": "apify-project",
"version": "0.0.1",
"description": "",
"author": "It's not you it's me",
"license": "ISC",
"dependencies": {
"apify": "0.21.10",
"puppeteer-har": "latest",
"type-is": "latest"
},
"scripts": {
"start": "node main.js"
}
}

main.js

// Apify SDK: actor entry point, input/output storages and the Puppeteer crawler
const Apify = require('apify');
// Records the HAR of a Puppeteer page
const PuppeteerHar = require('puppeteer-har');
// Used to derive file names of the stored HAR files from request keys
const crypto = require('crypto');
// Used to decide from the Content-Type header whether a response body is textual
const typeis = require('type-is');

// Some pages might take a long time to open, so navigation gets a generous limit (in seconds)
const NAVIGATION_TIMEOUT_SECS = 120;

// Maps a supported country code to the proxy group used to load pages from that country.
// Frozen so that the table cannot be modified by accident at runtime.
const COUNTRY_CODE_TO_PROXY_GROUP = Object.freeze({
    UK: "LaxcWp84azNk7xa7P",
    US: "TGByFdWkWE5cAnmDg",
    CZ: "di5BsrawgufWetctm",
    DK: "r8jdq6jyPjpa9c3Lw",
    BR: "so5Zy6Xc62Faszhc4",
    VN: "nYx7ojRzbyaTm9HZ7",
});

/**
 * Builds the API URL of a record in the default key-value store of this run.
 * The store ID is read from the APIFY_DEFAULT_KEY_VALUE_STORE_ID environment variable.
 *
 * @param {string} recordKey - Key of the record; it is URL-encoded so that
 *   special characters cannot break the path.
 * @returns {string} URL of the record.
 */
const getKeyValueStoreUrl = (recordKey) => {
    const storeId = process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID;
    return `https://api.apify.com/v2/key-value-stores/${storeId}/records/${encodeURIComponent(recordKey)}`;
};

/**
 * Saves the resulting HAR file to the key-value store,
 * and adds a row to the dataset with a link to it and info about the page.
 *
 * Stops the HAR recording attached to the request (request.myHar), copies the
 * captured response bodies into it, and finally marks the request as done.
 * A dataset row is pushed even when the HAR could not be produced; in that
 * case its harFileUrl is null.
 *
 * @param {Object} request - Crawler request carrying myHar and myResponses.
 * @param {boolean} reportErrors - Whether request.errorMessages should be
 *   included in the dataset row.
 * @returns {Promise<void>}
 */
const saveResultingHar = async (request, reportErrors) => {
    let resultingHar = null;
    try {
        resultingHar = await request.myHar.stop();
        await assignContentToRequests(request, request.myResponses, resultingHar);
    } catch (e) {
        // request.myHar.stop() sometimes fails; record the error on the request
        // and continue so that a dataset row is still produced
        request.pushErrorMessage(e);
    }

    // The recorder and the captured responses are not needed any more
    delete request.myHar;
    delete request.myResponses;

    // File name derived from the request's unique key: the first 17 characters
    // of its base64 SHA-256 digest, with "+" and "/" replaced by "x"
    const hash = crypto.createHash("sha256")
        .update(request.uniqueKey)
        .digest("base64")
        .replace(/[+/]/g, "x")
        .slice(0, 17);
    const fileName = `${hash}.har`;

    if (resultingHar) await Apify.setValue(fileName, resultingHar);
    await Apify.pushData({
        pageUrl: request.url,
        harFileUrl: resultingHar ? getKeyValueStoreUrl(fileName) : null,
        errorMessages: reportErrors ? (request.errorMessages || undefined) : undefined,
    });

    // Only claim success when a HAR file was actually stored
    if (resultingHar) {
        console.log(`HAR of ${request.url} saved successfully.`);
    } else {
        console.log(`HAR of ${request.url} could not be generated, result saved without it.`);
    }

    // Make sure the request won't be repeated
    request.myDone = true;
};

/**
 * Copies the bodies of the captured responses into the matching entries of the HAR.
 *
 * Entries are matched to responses by URL, compared literally. Textual bodies
 * (text/*, JSON, JavaScript) are stored as UTF-8 text, everything else as base64
 * with the entry's content.encoding set accordingly.
 *
 * TODO: Matching by requestId would be more precise than matching by URL.
 *
 * @param {Object} request - Crawler request the responses belong to (currently unused).
 * @param {Array<Object>} responses - Captured Puppeteer responses; may be missing.
 * @param {Object} resultingHar - HAR object whose log.entries are updated in place.
 * @returns {Promise<void>}
 */
const assignContentToRequests = async (request, responses, resultingHar) => {
    const entries = resultingHar.log.entries;

    // Index entries by URL for O(1) look-up; for a repeated URL the first entry wins
    const entryByUrl = new Map();
    for (const entry of entries) {
        const entryUrl = entry.request.url;
        if (!entryByUrl.has(entryUrl)) entryByUrl.set(entryUrl, entry);
    }

    for (const res of responses || []) {
        const requestUrl = res.request().url();
        // Exact match first; otherwise accept an entry whose URL merely starts with
        // the response URL (presumably the HAR URL can carry a suffix such as a fragment).
        // Both comparisons are literal, so URL characters are never treated as a pattern.
        const harEntry = entryByUrl.get(requestUrl)
            || entries.find((entry) => entry.request.url.startsWith(requestUrl));
        if (!harEntry) continue;

        try {
            const body = await res.buffer();
            const headers = res.headers();
            const encoding = typeis.is(headers["content-type"], ['text/*', 'json', 'javascript'])
                ? "utf8"
                : "base64";
            harEntry.response.content.text = body.toString(encoding);
            if (encoding === "base64") harEntry.response.content.encoding = "base64";
        } catch (e) {
            // Best effort: when the body of a response cannot be read,
            // its HAR entry is simply left without content
        }
    }
};

// Actor entry point: validates the input, then crawls every source URL with a fresh
// browser instance, recording a HAR for each page and saving it via saveResultingHar()
Apify.main(async () => {
    // Fetch and check actor input; fall back to an empty object so that a missing
    // INPUT record produces the descriptive error below instead of a destructuring TypeError
    const input = await Apify.getValue('INPUT');
    const { country, sourceUrls } = input || {};
    if (!sourceUrls || !Array.isArray(sourceUrls)) throw new Error("Input is missing sources declaration.");

    let proxyGroup = null;
    if (country) {
        // Own-property check, so that inherited keys such as "constructor" are not accepted as countries
        const isSupported = Object.prototype.hasOwnProperty.call(COUNTRY_CODE_TO_PROXY_GROUP, country);
        proxyGroup = isSupported ? COUNTRY_CODE_TO_PROXY_GROUP[country] : null;
        if (!proxyGroup) throw new Error(`Proxies for the given country: ${country} are not available.`);
        console.log(`Country set to ${country}, using proxy group ${proxyGroup}`);
    }

    // Prepare list of URLs to crawl
    const requestList = new Apify.RequestList({ sources: sourceUrls });
    await requestList.initialize();

    // Run the crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        launchPuppeteerOptions: {
            // The proxy is only used when a country was requested
            useApifyProxy: !!proxyGroup,
            apifyProxyGroups: proxyGroup ? [proxyGroup] : null,
        },
        // This is necessary so that each page is processed by a new browser instance, without any cache!
        retireInstanceAfterRequestCount: 1,
        // handlePageFunction() can take its time
        pageOpsTimeoutMillis: 60000,
        gotoFunction: async ({ request, page }) => {
            // The result for this request was already saved, nothing to do
            if (request.myDone) return;
            console.log(`Collecting HAR for ${request.url}`);
            request.myHar = new PuppeteerHar(page);
            await request.myHar.start();

            page.setDefaultNavigationTimeout(NAVIGATION_TIMEOUT_SECS * 1000);

            // PuppeteerHar doesn't track response body, so we need to do it ourselves
            request.myResponses = [];
            page.on("response", (res) => {
                // myResponses is deleted once the HAR was saved; later responses are ignored
                if (request.myResponses) request.myResponses.push(res);
            });

            // NOTE: For some reason, PuppeteerHar skips the first page
            await page.goto('about:blank');
            return page.goto(request.url, { waitUntil: ["load", "networkidle2"] });
        },
        handlePageFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for loaded page ${request.url}`);
            await saveResultingHar(request, false);
        },
        // Failed pages are saved too, together with their error messages
        handleFailedRequestFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for failed page ${request.url}`);
            await saveResultingHar(request, true);
        },
    });
    await crawler.run();
});