
HAR files for URL list
Deprecated
Generates an HTTP Archive (HAR) file for each web page specified in a list of URLs supplied as a JSON array or a CSV file (see the example input below). Optionally, the pages can be loaded using proxies from a specific country. The resulting HAR files are stored in the key-value store, and links to them, together with basic information about each page, are saved to the dataset.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 39
Monthly users: 2
Last modified: 2 years ago
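
Judging from the input handling in main.js below (it reads country and sourceUrls from the INPUT record and passes the sources directly to Apify.RequestList), the actor input should look something like the sketch below. The URLs and country code are illustrative, and requestsFromUrl is the standard RequestList option for loading a URL list from a remote file, which is presumably how the CSV variant mentioned above is handled:

{
    "country": "US",
    "sourceUrls": [
        { "url": "https://example.com" },
        { "requestsFromUrl": "https://example.com/list-of-urls.csv" }
    ]
}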
Dockerfile
# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have a fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{ "name": "apify-project", "version": "0.0.1", "description": "", "author": "It's not you it's me", "license": "ISC", "dependencies": { "apify": "0.21.10", "puppeteer-har": "latest", "type-is": "latest" }, "scripts": { "start": "node main.js" }}
main.js
const Apify = require('apify');
const PuppeteerHar = require('puppeteer-har');
const crypto = require('crypto');
const typeis = require('type-is');

// Some pages might take a long time to open
const NAVIGATION_TIMEOUT_SECS = 120;

const COUNTRY_CODE_TO_PROXY_GROUP = {
    UK: "LaxcWp84azNk7xa7P",
    US: "TGByFdWkWE5cAnmDg",
    CZ: "di5BsrawgufWetctm",
    DK: "r8jdq6jyPjpa9c3Lw",
    BR: "so5Zy6Xc62Faszhc4",
    VN: "nYx7ojRzbyaTm9HZ7",
};

const getKeyValueStoreUrl = (recordKey) => {
    return `https://api.apify.com/v2/key-value-stores/${process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/${recordKey}`;
};

// Saves the resulting HAR file to the key-value store,
// and adds a row to the dataset with a link to it and info about the page
const saveResultingHar = async (request, reportErrors) => {
    let resultingHar = null;
    try {
        resultingHar = await request.myHar.stop();
        await assignContentToRequests(request, request.myResponses, resultingHar);
    } catch (e) {
        // request.myHar.stop() sometimes fails
        request.pushErrorMessage(e);
    }

    delete request.myHar;
    delete request.myResponses;

    const fileName = crypto.createHash("sha256").update(request.uniqueKey).digest("base64").replace(/[+/]/g, "x").substr(0, 17) + '.har';
    if (resultingHar) await Apify.setValue(fileName, resultingHar);
    await Apify.pushData({
        pageUrl: request.url,
        harFileUrl: resultingHar ? getKeyValueStoreUrl(fileName) : null,
        errorMessages: reportErrors ? (request.errorMessages || undefined) : undefined,
    });

    console.log(`HAR of ${request.url} saved successfully.`);

    // Make sure the request won't be repeated
    request.myDone = true;
};

// TODO: This should use requestId and a dictionary for O(1) look-up!
const assignContentToRequests = async (request, responses, resultingHar) => {
    for (const res of responses) {
        const requestUrl = res.request().url();
        const harEntry = resultingHar.log.entries.find(entry => entry.request.url.match(requestUrl.substr(0, 100))); // limit length to prevent errors
        if (!harEntry) {
            // console.log(`HAR entry not found (page: ${request.url}, request: ${requestUrl})`);
            continue;
        }
        // console.log(`HAR entry found (page: ${request.url}, request: ${requestUrl})`);
        try {
            const body = await res.buffer();
            const headers = res.headers();
            const encoding = typeis.is(headers["content-type"], ['text/*', 'json', 'javascript'])
                ? "utf8"
                : "base64";
            harEntry.response.content.text = body.toString(encoding);
            if (encoding === "base64") harEntry.response.content.encoding = "base64";
        } catch (e) {
            // console.log(`Error assigning response content (page: ${request.url}, request: ${requestUrl}): ${e.message}`);
        }
    }
};

Apify.main(async () => {
    // Fetch and check actor input
    const { country, sourceUrls } = await Apify.getValue('INPUT');
    if (!sourceUrls || !Array.isArray(sourceUrls)) throw new Error("Input is missing sources declaration.");
    let proxyGroup = null;
    if (country) {
        proxyGroup = COUNTRY_CODE_TO_PROXY_GROUP[country];
        if (!proxyGroup) throw new Error(`Proxies for the given country: ${country} are not available.`);
        console.log(`Country set to ${country}, using proxy group ${proxyGroup}`);
    }

    // Prepare the list of URLs to crawl
    const requestList = new Apify.RequestList({ sources: sourceUrls });
    await requestList.initialize();

    // Run the crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        launchPuppeteerOptions: {
            useApifyProxy: !!proxyGroup,
            apifyProxyGroups: proxyGroup ? [proxyGroup] : null,
        },
        // This is necessary so that each page is processed by a new browser instance, without any cache!
        retireInstanceAfterRequestCount: 1,
        // handlePageFunction() can take its time
        pageOpsTimeoutMillis: 60000,
        gotoFunction: async ({ request, page }) => {
            if (request.myDone) return;
            console.log(`Collecting HAR for ${request.url}`);
            request.myHar = new PuppeteerHar(page);
            await request.myHar.start();

            page.setDefaultNavigationTimeout(NAVIGATION_TIMEOUT_SECS * 1000);

            // PuppeteerHar doesn't track response bodies, so we need to do it ourselves
            request.myResponses = [];
            page.on("response", (res) => {
                if (request.myResponses) request.myResponses.push(res);
            });

            // NOTE: For some reason, PuppeteerHar skips the first page
            await page.goto('about:blank');
            return page.goto(request.url, { waitUntil: ["load", "networkidle2"] });
        },
        handlePageFunction: async ({ page, request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for loaded page ${request.url}`);
            await saveResultingHar(request, false);
        },
        handleFailedRequestFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for failed page ${request.url}`);
            await saveResultingHar(request, true);
        },
    });
    await crawler.run();
});
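
Given the Apify.pushData() call in saveResultingHar(), each dataset record pairs a crawled page with the download link for its HAR file in the key-value store. A successful record should look roughly like this sketch; the page URL, store ID, and hash-based file name are placeholders:

{
    "pageUrl": "https://example.com",
    "harFileUrl": "https://api.apify.com/v2/key-value-stores/<storeId>/records/<hash>.har"
}

For failed pages, harFileUrl may be null and the record also carries an errorMessages field with the errors collected during crawling.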