HAR files for URL list
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
HAR files for URL list
apify/har-files-for-url-list
Generates an HTTP Archive (HAR) file for web pages specified by a list of URLs in a JSON or CSV file. Optionally, the pages can be loaded using proxies from a specific country. The resulting HAR files are stored in the key-value store or in the dataset.
Dockerfile
1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# Second, copy just package.json and package-lock.json since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
1{
2 "name": "apify-project",
3 "version": "0.0.1",
4 "description": "",
5 "author": "It's not you it's me",
6 "license": "ISC",
7 "dependencies": {
8 "apify": "0.21.10",
9 "puppeteer-har": "latest",
10 "type-is": "latest"
11 },
12 "scripts": {
13 "start": "node main.js"
14 }
15}
main.js
1const Apify = require('apify');
2const PuppeteerHar = require('puppeteer-har');
3const crypto = require('crypto');
4const typeis = require('type-is');
5
// Some pages might take a long time to open
const NAVIGATION_TIMEOUT_SECS = 120;

// Maps the country codes accepted in the actor input to the proxy group
// that is passed to the crawler for that country.
// Frozen so this shared look-up table cannot be modified at runtime.
const COUNTRY_CODE_TO_PROXY_GROUP = Object.freeze({
    UK: "LaxcWp84azNk7xa7P",
    US: "TGByFdWkWE5cAnmDg",
    CZ: "di5BsrawgufWetctm",
    DK: "r8jdq6jyPjpa9c3Lw",
    BR: "so5Zy6Xc62Faszhc4",
    VN: "nYx7ojRzbyaTm9HZ7",
});
17
/**
 * Builds the Apify API URL of a record in the key-value store identified by
 * the APIFY_DEFAULT_KEY_VALUE_STORE_ID environment variable.
 *
 * @param {string} recordKey - Key of the record inside the store.
 * @returns {string} URL of the record.
 */
const getKeyValueStoreUrl = (recordKey) => {
    const storeId = process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID;
    // Encode the key so that characters such as "/", "?" or "#" cannot change
    // the path of the resulting URL. Keys without such characters are unaffected.
    return `https://api.apify.com/v2/key-value-stores/${storeId}/records/${encodeURIComponent(recordKey)}`;
};
21
/**
 * Stops the HAR recording attached to the request, saves the resulting HAR file
 * to the key-value store, and adds a row to the dataset with a link to it and
 * info about the page.
 *
 * The request is always marked as done (`request.myDone = true`) and a dataset
 * row is always pushed, even when no HAR could be produced; in that case
 * `harFileUrl` is null.
 *
 * @param {Object} request - Crawled request carrying `myHar` and `myResponses`.
 * @param {boolean} reportErrors - Whether to include `request.errorMessages` in the row.
 *   Error messages are also included whenever no HAR could be produced.
 * @returns {Promise<void>}
 */
const saveResultingHar = async (request, reportErrors) => {
    let resultingHar = null;
    try {
        resultingHar = await request.myHar.stop();
        await assignContentToRequests(request, request.myResponses, resultingHar);
    } catch (e) {
        // request.myHar.stop() sometimes fails
        request.pushErrorMessage(e);
    }

    delete request.myHar;
    delete request.myResponses;

    // File name is derived from the request's unique key; "+" and "/" of the
    // base64 digest are replaced so the name contains neither character.
    const digest = crypto.createHash("sha256").update(request.uniqueKey).digest("base64");
    const fileName = `${digest.replace(/[+/]/g, "x").slice(0, 17)}.har`;

    if (resultingHar) await Apify.setValue(fileName, resultingHar);

    // A row without a HAR link and without any explanation is of no use,
    // so error messages are reported whenever the HAR is missing.
    const includeErrors = reportErrors || !resultingHar;
    await Apify.pushData({
        pageUrl: request.url,
        harFileUrl: resultingHar ? getKeyValueStoreUrl(fileName) : null,
        errorMessages: includeErrors ? (request.errorMessages || undefined) : undefined,
    });

    if (resultingHar) {
        console.log(`HAR of ${request.url} saved successfully.`);
    } else {
        console.log(`HAR of ${request.url} could not be generated.`);
    }

    // Make sure the request won't be repeated
    request.myDone = true;
};
50
/**
 * Copies the bodies of the collected responses into the matching entries of
 * the HAR, updating `resultingHar.log.entries` in place.
 *
 * Responses are paired with HAR entries by request URL, compared as plain
 * strings with any "#fragment" removed. The URL is deliberately NOT used as a
 * regular expression: characters such as "?" or "(" are common in URLs and
 * would either prevent the match or throw.
 * When the same URL occurs in several entries, the n-th response for that URL
 * is paired with the n-th entry; surplus responses reuse the last entry.
 *
 * NOTE(review): assumes the HAR stores the same URL string that
 * `res.request().url()` reports - verify against the PuppeteerHar output.
 *
 * @param {Object} request - Crawled request; not used by this function.
 * @param {Array} responses - Response objects collected while the page was loading.
 * @param {Object} resultingHar - HAR object whose entries receive the content.
 * @returns {Promise<void>}
 */
const assignContentToRequests = async (request, responses, resultingHar) => {
    const stripFragment = (url) => {
        const hashIndex = url.indexOf("#");
        return hashIndex === -1 ? url : url.slice(0, hashIndex);
    };

    // Index the HAR entries by URL once, so that every response is an O(1) look-up
    const entriesByUrl = new Map();
    for (const entry of resultingHar.log.entries) {
        const key = stripFragment(entry.request.url);
        const bucket = entriesByUrl.get(key);
        if (bucket) {
            bucket.push(entry);
        } else {
            entriesByUrl.set(key, [entry]);
        }
    }

    for (const res of responses) {
        const bucket = entriesByUrl.get(stripFragment(res.request().url()));
        if (!bucket) continue;

        // Consume entries in order, but always keep the last one for surplus responses
        const harEntry = bucket.length > 1 ? bucket.shift() : bucket[0];
        try {
            const body = await res.buffer();
            const headers = res.headers();
            const encoding = typeis.is(headers["content-type"], ['text/*', 'json', 'javascript'])
                ? "utf8"
                : "base64";
            harEntry.response.content.text = body.toString(encoding);
            if (encoding === "base64") harEntry.response.content.encoding = "base64";
        } catch (e) {
            // Best-effort: when the body cannot be read, the HAR entry is left
            // without content rather than failing the whole HAR.
        }
    }
};
74
// Actor entry point: reads the input, builds the list of URLs and crawls each
// of them with Puppeteer while recording a HAR file per page.
Apify.main(async () => {
    // Fetch and check actor input.
    // A missing INPUT record is treated like an empty object, so that it is
    // reported by the check below instead of failing on destructuring.
    const { country, sourceUrls } = (await Apify.getValue('INPUT')) || {};
    if (!Array.isArray(sourceUrls)) throw new Error("Input is missing sources declaration.");

    let proxyGroup = null;
    if (country) {
        // Own-property check, so that inherited keys such as "constructor"
        // are not mistaken for a supported country
        const isSupported = Object.prototype.hasOwnProperty.call(COUNTRY_CODE_TO_PROXY_GROUP, country);
        proxyGroup = isSupported ? COUNTRY_CODE_TO_PROXY_GROUP[country] : null;
        if (!proxyGroup) throw new Error(`Proxies for the given country: ${country} are not available.`);
        console.log(`Country set to ${country}, using proxy group ${proxyGroup}`);
    }

    // Prepare list of URLs to crawl
    const requestList = new Apify.RequestList({ sources: sourceUrls });
    await requestList.initialize();

    // Run the crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        launchPuppeteerOptions: {
            useApifyProxy: !!proxyGroup,
            apifyProxyGroups: proxyGroup ? [proxyGroup] : null,
        },
        // This is necessary so that each page is processed by a new browser instance, without any cache!
        retireInstanceAfterRequestCount: 1,
        // handlePageFunction() can take its time
        pageOpsTimeoutMillis: 60000,
        gotoFunction: async ({ request, page }) => {
            // The result for this request has already been stored
            if (request.myDone) return;
            console.log(`Collecting HAR for ${request.url}`);
            request.myHar = new PuppeteerHar(page);
            await request.myHar.start();

            page.setDefaultNavigationTimeout(NAVIGATION_TIMEOUT_SECS * 1000);

            // PuppeteerHar doesn't track response body, so we need to do it ourselves
            request.myResponses = [];
            page.on("response", (res) => {
                // myResponses is deleted once the HAR has been saved
                if (request.myResponses) request.myResponses.push(res);
            });

            // NOTE: For some reason, PuppeteerHar skips the first page
            await page.goto('about:blank');
            return page.goto(request.url, { waitUntil: ["load", "networkidle2"] });
        },
        handlePageFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for loaded page ${request.url}`);
            await saveResultingHar(request, false);
        },
        handleFailedRequestFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for failed page ${request.url}`);
            await saveResultingHar(request, true);
        },
    });
    await crawler.run();
});
Developer
Maintained by Community
Categories