Download HTML from URLs

mtrunkat/url-list-download-html

This actor takes a list of URLs, downloads the HTML of each page, and stores it in the run's default dataset.
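
For example, once the actor is deployed you can start it from your own code with the apify-client NPM package and read the downloaded HTML from the run's default dataset. This is only a minimal sketch: the token is a placeholder and the input object mirrors the schema shown below.

const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'MY_APIFY_TOKEN' }); // placeholder token

(async () => {
    // Start the actor and wait for the run to finish.
    const run = await client.actor('mtrunkat/url-list-download-html').call({
        requestListSources: [{ url: 'https://apify.com' }],
        proxyConfiguration: { useApifyProxy: true },
    });

    // Each downloaded page is one item in the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(items.map((item) => item.url));
})();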

Dockerfile

FROM apify/actor-node-puppeteer-chrome:16

COPY package*.json ./

RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && (npm list --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

COPY . ./

ENV APIFY_DISABLE_OUTDATED_WARNING 1
ENV npm_config_loglevel=silent

INPUT_SCHEMA.json

{
    "title": "Input",
    "type": "object",
    "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.",
    "schemaVersion": 1,
    "properties": {
        "requestListSources": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources",
            "minItems": 1
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": true },
            "default": {},
            "editor": "proxy"
        },
        "handlePageTimeoutSecs": {
            "title": "Page timeout",
            "type": "integer",
            "description": "Maximum time the scraper will spend processing one page.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "maxRequestRetries": {
            "title": "Maximum request retries",
            "description": "How many times a failed request is retried before giving up.",
            "default": 1,
            "prefill": 1,
            "type": "integer",
            "editor": "number"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of Chromium masquerading as Chrome. Using this option may help with bypassing certain anti-scraping protections, but it also risks making the scraper unstable or preventing it from working at all.",
            "default": false,
            "groupCaption": "Browser masking options",
            "groupDescription": "Settings that help mask the scraper as a real user and prevent its detection."
        }
    },
    "required": ["requestListSources", "proxyConfiguration"]
}
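
An input that validates against this schema could look like the example below. The userData.waitForSelector field is an optional extra honored by the page handler in main.js; whether the URL list editor preserves arbitrary userData on each source is an assumption here, and the selector value is purely illustrative.

{
    "requestListSources": [
        { "url": "https://apify.com" },
        { "url": "https://apify.com/store", "userData": { "waitForSelector": "body" } }
    ],
    "proxyConfiguration": { "useApifyProxy": true },
    "handlePageTimeoutSecs": 60,
    "maxRequestRetries": 1,
    "useChrome": false
}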

main.js

const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getInput();

    // The request list holds the URLs to download; the proxy configuration
    // comes straight from the actor input.
    const requestList = await Apify.openRequestList('my-list', input.requestListSources);
    const proxyConfiguration = await Apify.createProxyConfiguration(input.proxyConfiguration);

    // Store the HTML of each successfully loaded page in the default dataset.
    const handlePageFunction = async ({ request, response, page }) => {
        const { waitForSelector } = request.userData;

        // Optionally wait for a selector passed in the request's userData.
        if (waitForSelector) {
            await page.waitForSelector(waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            fullHtml: await page.content(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    // Requests that exceed the retry limit are stored with the #error flag set.
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const puppeteerCrawler = new Apify.PuppeteerCrawler({
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        proxyConfiguration,
        useSessionPool: true,
        sessionPoolOptions: {
            sessionOptions: {
                maxErrorScore: 0.5,
            },
        },
        browserPoolOptions: {
            useFingerprints: true,
            retireBrowserAfterPageCount: 1,
            maxOpenPagesPerBrowser: 1, // required to use one IP per tab
        },
        persistCookiesPerSession: false,
        maxRequestRetries: typeof input.maxRequestRetries === 'number' ? input.maxRequestRetries : 1,
        handlePageTimeoutSecs: input.handlePageTimeoutSecs || 60,
        launchContext: {
            useChrome: input.useChrome || false,
            launchOptions: {
                headless: false,
                ignoreHTTPSErrors: true,
                args: ['--ignore-certificate-errors'],
            },
        },
    });

    await puppeteerCrawler.run();
});
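
Each successful request therefore produces one dataset item in roughly the shape sketched below; the values are illustrative placeholders, not real output, and the #debug contents are trimmed.

{
    "url": "https://apify.com",
    "finishedAt": "2023-01-01T12:00:00.000Z",
    "fullHtml": "<!DOCTYPE html><html>…</html>",
    "html": "<body>…</body>",
    "#debug": { "requestId": "…", "url": "https://apify.com", "statusCode": 200 },
    "#error": false
}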

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.3.2",
        "puppeteer": "^13"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}