HAR files for URL list avatar

HAR files for URL list

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
HAR files for URL list

HAR files for URL list

apify/har-files-for-url-list

Generates an HTTP Archive (HAR) file for each web page specified by a list of URLs in a JSON or CSV file. Optionally, the pages can be loaded using proxies from a specific country. The resulting HAR files are stored in the key-value store or in the dataset.

Dockerfile

1# This is a template for a Dockerfile used to run acts in the Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# Second, copy just package.json and package-lock.json since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY --chown=myuser:myuser . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "0.21.10",
9        "puppeteer-har": "latest",
10        "type-is": "latest"
11    },
12    "scripts": {
13        "start": "node main.js"
14    }
15}

main.js

1const Apify = require('apify');
2const PuppeteerHar = require('puppeteer-har');
3const crypto = require('crypto');
4const typeis = require('type-is');
5
// Navigation timeout in seconds; applied to each page via
// page.setDefaultNavigationTimeout(). Some pages might take a long time to open.
const NAVIGATION_TIMEOUT_SECS = 120;

// Maps the country code accepted in the actor input to the identifier of the
// proxy group that is passed to the crawler as `apifyProxyGroups`.
// The table is frozen so this shared constant cannot be modified at runtime.
const COUNTRY_CODE_TO_PROXY_GROUP = Object.freeze({
    UK: "LaxcWp84azNk7xa7P",
    US: "TGByFdWkWE5cAnmDg",
    CZ: "di5BsrawgufWetctm",
    DK: "r8jdq6jyPjpa9c3Lw",
    BR: "so5Zy6Xc62Faszhc4",
    VN: "nYx7ojRzbyaTm9HZ7",
});
17
/**
 * Builds the API URL of a record in the key-value store whose ID is given by
 * the APIFY_DEFAULT_KEY_VALUE_STORE_ID environment variable.
 *
 * @param {string} recordKey - Key of the record inside the store.
 * @returns {string} URL pointing at the record.
 */
const getKeyValueStoreUrl = (recordKey) => {
    const storeId = process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID;
    // Escape the key so characters such as "/" or "?" cannot alter the path
    // or start a query string. Keys made only of letters, digits and "."
    // (like the generated "<hash>.har" names) come out unchanged.
    const encodedKey = encodeURIComponent(recordKey);
    return `https://api.apify.com/v2/key-value-stores/${storeId}/records/${encodedKey}`;
};
21
/**
 * Stops the HAR recording attached to the request, saves the resulting HAR
 * file to the key-value store, and adds a row to the dataset with a link to
 * it and info about the page.
 *
 * The dataset row is written even when no HAR could be produced; in that case
 * `harFileUrl` is null and nothing is stored in the key-value store.
 *
 * @param {Object} request - Crawler request carrying the `myHar` recorder and
 *     the `myResponses` list that gotoFunction attached to it.
 * @param {boolean} reportErrors - When true, the request's `errorMessages`
 *     are included in the dataset row.
 * @returns {Promise<void>}
 */
const saveResultingHar = async (request, reportErrors) => {
    let resultingHar = null;
    try {
        resultingHar = await request.myHar.stop();
        await assignContentToRequests(request, request.myResponses, resultingHar);
    } catch (e) {
        // request.myHar.stop() sometimes fails. Record the error on the request
        // and carry on so the page still gets its dataset row.
        request.pushErrorMessage(e);
    }

    // The recorder and the collected responses are not needed any more.
    delete request.myHar;
    delete request.myResponses;

    // File name: the first 17 characters of the base64 SHA-256 digest of the
    // request's unique key, with "+" and "/" replaced by "x".
    const keyHash = crypto
        .createHash("sha256")
        .update(request.uniqueKey)
        .digest("base64")
        .replace(/[+/]/g, "x")
        .slice(0, 17);
    const fileName = `${keyHash}.har`;

    if (resultingHar) await Apify.setValue(fileName, resultingHar);
    await Apify.pushData({
        pageUrl: request.url,
        harFileUrl: resultingHar ? getKeyValueStoreUrl(fileName) : null,
        errorMessages: reportErrors ? (request.errorMessages || undefined) : undefined,
    });

    // Only claim success when a HAR file was actually stored.
    if (resultingHar) {
        console.log(`HAR of ${request.url} saved successfully.`);
    } else {
        console.log(`HAR of ${request.url} could not be generated.`);
    }

    // Make sure the request won't be repeated
    request.myDone = true;
};
50
/**
 * Copies the bodies of the collected responses into the matching entries of
 * the HAR (PuppeteerHar doesn't track response bodies itself).
 *
 * A response is paired with the first HAR entry whose request URL equals the
 * response's request URL. If there is none, it falls back to the first entry
 * whose URL contains the first 100 characters of that URL. Both comparisons
 * are literal: the URL is never interpreted as a regular expression, so
 * characters such as "?", "+", "(" or "[" neither break nor misdirect the match.
 *
 * Text-like bodies (text/*, json, javascript) are stored as UTF-8 text,
 * everything else as base64 with `content.encoding` set to "base64".
 *
 * @param {Object} request - Crawler request the responses belong to (currently unused).
 * @param {Array<Object>} responses - Response objects exposing `request().url()`,
 *     `buffer()` and `headers()`.
 * @param {Object} resultingHar - HAR object; entries in `log.entries` are modified in place.
 * @returns {Promise<void>}
 */
const assignContentToRequests = async (request, responses, resultingHar) => {
    const harEntries = resultingHar.log.entries;

    // Index entries by exact URL for O(1) look-up. The first entry for a URL
    // wins, which mirrors the "first match" behavior of Array.prototype.find.
    const entryByUrl = new Map();
    for (const entry of harEntries) {
        if (!entryByUrl.has(entry.request.url)) entryByUrl.set(entry.request.url, entry);
    }

    for (const res of responses) {
        const requestUrl = res.request().url();

        let harEntry = entryByUrl.get(requestUrl);
        if (!harEntry) {
            // Fallback for entries whose URL is not identical to the response's
            // request URL: literal substring match on the first 100 characters.
            const urlPrefix = requestUrl.slice(0, 100);
            harEntry = harEntries.find((entry) => entry.request.url.includes(urlPrefix));
        }
        if (!harEntry) continue;

        try {
            const body = await res.buffer();
            const headers = res.headers();
            const encoding = typeis.is(headers["content-type"], ['text/*', 'json', 'javascript'])
                ? "utf8"
                : "base64";
            harEntry.response.content.text = body.toString(encoding);
            if (encoding === "base64") harEntry.response.content.encoding = "base64";
        } catch (e) {
            // Best effort: if the body cannot be read, the entry is left
            // without content and the remaining responses are still processed.
        }
    }
};
74
// Actor entry point: validates the input, builds the list of URLs and runs a
// Puppeteer crawler that records one HAR file per page.
Apify.main(async () => {
    // Fetch and check actor input. If no INPUT value is returned, fall back to
    // an empty object so the validation below reports the problem with a
    // readable message instead of a destructuring TypeError.
    const { country, sourceUrls } = (await Apify.getValue('INPUT')) || {};
    if (!Array.isArray(sourceUrls)) throw new Error("Input is missing sources declaration.");

    let proxyGroup = null;
    if (country) {
        // Accept only codes defined directly on the lookup table, so inherited
        // keys such as "constructor" are rejected like any unknown country.
        const isKnownCountry = Object.prototype.hasOwnProperty.call(COUNTRY_CODE_TO_PROXY_GROUP, country);
        proxyGroup = isKnownCountry ? COUNTRY_CODE_TO_PROXY_GROUP[country] : null;
        if (!proxyGroup) throw new Error(`Proxies for the given country: ${country} are not available.`);
        console.log(`Country set to ${country}, using proxy group ${proxyGroup}`);
    }

    // Prepare list of URLs to crawl
    const requestList = new Apify.RequestList({ sources: sourceUrls });
    await requestList.initialize();

    // Run the crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        launchPuppeteerOptions: {
            useApifyProxy: !!proxyGroup,
            apifyProxyGroups: proxyGroup ? [proxyGroup] : null,
        },
        // This is necessary so that each page is processed by a new browser instance, without any cache!
        retireInstanceAfterRequestCount: 1,
        // handlePageFunction() can take its time
        pageOpsTimeoutMillis: 60000,
        // Starts HAR recording and response collection, then navigates to the page.
        gotoFunction: async ({ request, page }) => {
            // Skip requests whose result has already been saved.
            if (request.myDone) return;
            console.log(`Collecting HAR for ${request.url}`);
            request.myHar = new PuppeteerHar(page);
            await request.myHar.start();

            page.setDefaultNavigationTimeout(NAVIGATION_TIMEOUT_SECS * 1000);

            // PuppeteerHar doesn't track response body, so we need to do it ourselves
            request.myResponses = [];
            page.on("response", (res) => {
                // myResponses is deleted once the HAR is saved; later responses are ignored.
                if (request.myResponses) request.myResponses.push(res);
            });

            // NOTE: For some reason, PuppeteerHar skips the first page
            await page.goto('about:blank');
            return page.goto(request.url, { waitUntil: ["load", "networkidle2"] });
        },
        // Page loaded: save the HAR without error details.
        handlePageFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for loaded page ${request.url}`);
            await saveResultingHar(request, false);
        },
        // Request failed: still save whatever was recorded, with error messages.
        handleFailedRequestFunction: async ({ request }) => {
            if (request.myDone) return;
            console.log(`Saving HAR for failed page ${request.url}`);
            await saveResultingHar(request, true);
        },
    });
    await crawler.run();
});
Developer
Maintained by Community
Categories