Download HTML from URLs avatar

Download HTML from URLs

Try for free

No credit card required

Go to Store
Download HTML from URLs

Download HTML from URLs

Try for free

No credit card required

This actor takes a list of URLs and downloads the HTML of each page.

Maintained by Community

Actor Metrics

  • 51 monthly users

  • No reviews yet


  • >99% runs succeeded

  • Created in Feb 2018

  • Modified a year ago


# This Dockerfile contains the instructions for building a Docker image that
# holds all the code and configuration needed to run the actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/

# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
#  apify/actor-node-basic (Node.js 10 on Alpine Linux, small and fast)
#  apify/actor-node-chrome (Node.js 10 + Chrome on Debian)
#  apify/actor-node-chrome-xvfb (Node.js 10 + Chrome + Xvfb on Debian)
# For more information, see the Apify documentation on base Docker images.
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome

# Second, copy just package.json since it should be the only file
# that affects NPM install in the next step.
COPY package.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. The tree listing is informational only, so a non-zero
# exit from "npm list" must not abort the build.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since this happens after NPM install, rebuilds are fast for most
# source file changes because the dependency layer stays cached.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
# CMD npm start


{
    "title": "Input",
    "type": "object",
    "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.",
    "schemaVersion": 1,
    "properties": {
        "requestListSources": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "" }
            ],
            "editor": "requestListSources",
            "minItems": 1
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": false },
            "default": {},
            "editor": "proxy"
        },
        "handlePageTimeoutSecs": {
            "title": "Page timeout",
            "type": "integer",
            "description": "Maximum time the scraper will spend processing one page.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of a Chromium masking as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.",
            "default": false,
            "groupCaption": "Browser masking options",
            "groupDescription": "Settings that help mask as a real user and prevent scraper detection."
        },
        "useStealth": {
            "title": "Use Stealth",
            "type": "boolean",
            "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.",
            "default": false
        }
    },
    "required": ["requestListSources"]
}


const Apify = require('apify');

/**
 * Actor entry point.
 *
 * Reads the actor input, opens every request source from
 * `input.requestListSources` in a Puppeteer-driven crawler and pushes one
 * dataset item per request: the page's `document.body.outerHTML` on success,
 * or an item flagged with `'#error': true` when the request ultimately fails.
 *
 * Input fields read here: `requestListSources`, `proxyConfiguration`,
 * `useChrome`, `useStealth`, `handlePageTimeoutSecs`.
 *
 * @throws {Error} When the input is missing or `requestListSources` is not an array.
 */
Apify.main(async () => {
    const input = await Apify.getValue('INPUT');

    console.log(input);

    // Fail early with a readable message instead of a TypeError on property access below.
    if (!input || !Array.isArray(input.requestListSources)) {
        throw new Error('Invalid input: "requestListSources" must be an array of request sources.');
    }

    const requestList = await Apify.openRequestList('my-list', input.requestListSources);

    // Start from a copy of the proxy settings so the input object itself is never mutated.
    const launchPuppeteerOptions = Object.assign({}, input.proxyConfiguration);

    // These flags must be assigned; a bare property access is a no-op that
    // leaves the browser-masking options disabled.
    if (input.useChrome) launchPuppeteerOptions.useChrome = true;
    if (input.useStealth) launchPuppeteerOptions.stealth = true;

    /**
     * Stores the HTML of a successfully loaded page.
     * If the request carries `userData.waitForSelector`, waits for that selector first.
     */
    const handlePageFunction = async ({ request, response, page }) => {
        if (request.userData.waitForSelector) {
            await page.waitForSelector(request.userData.waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    /**
     * Records a request the crawler gave up on, so every input URL
     * still produces a dataset item.
     */
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const crawlerOptions = {
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        launchPuppeteerOptions,
    };

    // Only override the crawler's own default when the user supplied a timeout.
    if (input.handlePageTimeoutSecs) {
        crawlerOptions.handlePageTimeoutSecs = input.handlePageTimeoutSecs;
    }

    const puppeteerCrawler = new Apify.PuppeteerCrawler(crawlerOptions);
    await puppeteerCrawler.run();
});


{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^0.14.15"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}