Download HTML from URLs

Developed by Marek Trunkát
Maintained by Community

This Actor takes a list of URLs and downloads the HTML of each page.

Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 18
Runs succeeded: 47 (>99%)
Last modified: a year ago
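Each successfully processed page is stored as one dataset record. Based on the handlePageFunction in main.js below, a result looks roughly like this (values are illustrative, and the html field holds the page's <body> element):

{
    "url": "https://apify.com",
    "finishedAt": "2023-01-01T00:00:00.000Z",
    "html": "<body>...</body>",
    "#debug": { "requestId": "...", "statusCode": 200 },
    "#error": false
}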

Dockerfile

# Dockerfile contains instructions on how to build a Docker image that
# will contain all the code and configuration needed to run your actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/

# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
#   apify/actor-node-basic (Node.js 10 on Alpine Linux, small and fast)
#   apify/actor-node-chrome (Node.js 10 + Chrome on Debian)
#   apify/actor-node-chrome-xvfb (Node.js 10 + Chrome + Xvfb on Debian)
# For more information, see https://apify.com/docs/actor#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome

# Second, copy just package.json since it should be the only file
# that affects the NPM install in the next step.
COPY package.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, a quick build will be really
# fast for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
# CMD npm start
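As a quick local sanity check, the image can be built and run with standard Docker commands (a sketch; the download-html tag is arbitrary, and a local run will only get as far as reading its INPUT unless you supply one, which the Apify platform normally does for you):

# Build the image from the actor's root directory, then run it.
docker build -t download-html .
docker run download-html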

INPUT_SCHEMA.json

{
    "title": "Input",
    "type": "object",
    "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.",
    "schemaVersion": 1,
    "properties": {
        "requestListSources": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources",
            "minItems": 1
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": false },
            "default": {},
            "editor": "proxy"
        },
        "handlePageTimeoutSecs": {
            "title": "Page timeout",
            "type": "integer",
            "description": "Maximum time the scraper will spend processing one page.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of Chromium masquerading as Chrome. Using this option may help with bypassing certain anti-scraping protections, but it carries the risk that the scraper will be unstable or not work at all.",
            "default": false,
            "groupCaption": "Browser masking options",
            "groupDescription": "Settings that help mask the scraper as a real user and prevent its detection."
        },
        "useStealth": {
            "title": "Use Stealth",
            "type": "boolean",
            "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.",
            "default": false
        }
    },
    "required": ["requestListSources"]
}
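An input that satisfies this schema might look like the following (the second URL is illustrative; setting useApifyProxy to true routes requests through Apify Proxy):

{
    "requestListSources": [
        { "url": "https://apify.com" },
        { "url": "https://example.com" }
    ],
    "proxyConfiguration": { "useApifyProxy": true },
    "handlePageTimeoutSecs": 60,
    "useChrome": false,
    "useStealth": false
}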

main.js

const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getValue('INPUT');

    console.log(input);

    const requestList = await Apify.openRequestList('my-list', input.requestListSources);

    // Build the Puppeteer launch options from the proxy configuration
    // and the optional browser-masking flags.
    const launchPuppeteerOptions = Object.assign({}, input.proxyConfiguration);

    if (input.useChrome) launchPuppeteerOptions.useChrome = true;
    if (input.useStealth) launchPuppeteerOptions.stealth = true;

    // Store the HTML of each successfully loaded page. If the request
    // carries a waitForSelector in its userData, wait for that selector
    // to appear before taking the snapshot. Note that the snapshot
    // captures the page's <body> element.
    const handlePageFunction = async ({ request, response, page }) => {
        if (request.userData.waitForSelector) {
            await page.waitForSelector(request.userData.waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    // Requests that fail even after retries are stored with the #error
    // flag set, so they can be told apart from successful results.
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const crawlerOptions = {
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        launchPuppeteerOptions,
    };

    if (input.handlePageTimeoutSecs) {
        crawlerOptions.handlePageTimeoutSecs = input.handlePageTimeoutSecs;
    }

    const puppeteerCrawler = new Apify.PuppeteerCrawler(crawlerOptions);
    await puppeteerCrawler.run();
});
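Note that handlePageFunction honors an optional waitForSelector key in each request's userData, so a start URL can delay the HTML snapshot until a given element appears. The selector below is just an example:

{
    "requestListSources": [
        {
            "url": "https://apify.com",
            "userData": { "waitForSelector": "h1" }
        }
    ]
}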

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^0.14.15"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
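To try the actor locally, one option is the Apify CLI (a sketch assuming Node.js and the CLI are installed; apify run starts the actor with storage emulated on disk, reading input from apify_storage/key_value_stores/default/INPUT.json):

npm install
apify run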

Pricing

Pricing model: Pay per usage

This Actor is paid per platform usage. The Actor itself is free to use; you only pay for the Apify platform resources its runs consume.