Download HTML from URLs

Developed by Marek Trunkát
Maintained by Community

This actor takes a list of URLs and downloads the HTML of each page.

Rating: 0.0 (0 reviews)
Pricing: Pay per usage
Total users: 8.4k
Monthly users: 52
Runs succeeded: >99%
Last modified: a year ago
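
To call this actor from your own code, a minimal sketch using the same Apify SDK 0.x as main.js below could look like this. The actor ID string is a placeholder; substitute the actual ID shown on this actor's store page:

const Apify = require('apify');

Apify.main(async () => {
    // 'username/download-html-from-urls' is a hypothetical actor ID; use the
    // real one from the store page.
    const run = await Apify.call('username/download-html-from-urls', {
        requestListSources: [
            { url: 'https://apify.com' },
        ],
    });
    // The downloaded HTML ends up in the run's default dataset.
    console.log(`Run finished with status ${run.status}, dataset ${run.defaultDatasetId}`);
});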

Dockerfile

# Dockerfile contains instructions on how to build a Docker image that
# will contain all the code and configuration needed to run your actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/
# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
# apify/actor-node-basic (Node.js 10 on Alpine Linux, small and fast)
# apify/actor-node-chrome (Node.js 10 + Chrome on Debian)
# apify/actor-node-chrome-xvfb (Node.js 10 + Chrome + Xvfb on Debian)
# For more information, see https://apify.com/docs/actor#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome
# Second, copy just package.json since it should be the only file
# that affects NPM install in the next step
COPY package.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& npm list \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, subsequent builds will be really
# fast for most source file changes.
COPY . ./
# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Input",
    "type": "object",
    "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.",
    "schemaVersion": 1,
    "properties": {
        "requestListSources": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources",
            "minItems": 1
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": false },
            "default": {},
            "editor": "proxy"
        },
        "handlePageTimeoutSecs": {
            "title": "Page timeout",
            "type": "integer",
            "description": "Maximum time the scraper will spend processing one page.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of Chromium masquerading as Chrome. Using this option may help with bypassing certain anti-scraping protections, but it carries the risk of the scraper becoming unstable or not working at all.",
            "default": false,
            "groupCaption": "Browser masking options",
            "groupDescription": "Settings that help the scraper mask itself as a real user and prevent detection."
        },
        "useStealth": {
            "title": "Use Stealth",
            "type": "boolean",
            "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.",
            "default": false
        }
    },
    "required": ["requestListSources"]
}
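
For illustration, a complete input conforming to the schema above might look like this. The waitForSelector value in userData is a hypothetical CSS selector; as main.js below shows, the scraper waits for it before capturing the page's HTML:

{
    "requestListSources": [
        { "url": "https://apify.com" },
        {
            "url": "https://apify.com/store",
            "userData": { "waitForSelector": ".my-content" }
        }
    ],
    "proxyConfiguration": { "useApifyProxy": false },
    "handlePageTimeoutSecs": 60,
    "useChrome": false,
    "useStealth": false
}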

main.js

const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getValue('INPUT');

    console.log(input);

    const requestList = await Apify.openRequestList('my-list', input.requestListSources);
    const launchPuppeteerOptions = Object.assign({}, input.proxyConfiguration);

    // Enable the optional browser masking features from the input.
    if (input.useChrome) launchPuppeteerOptions.useChrome = true;
    if (input.useStealth) launchPuppeteerOptions.stealth = true;

    const handlePageFunction = async ({ request, response, page }) => {
        // Optionally wait for a selector provided via request.userData.
        if (request.userData.waitForSelector) {
            await page.waitForSelector(request.userData.waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            // Note: this captures only the <body> element, not the full document.
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    // Store a debug record for requests that failed too many times.
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const crawlerOptions = {
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        launchPuppeteerOptions,
    };

    if (input.handlePageTimeoutSecs) {
        crawlerOptions.handlePageTimeoutSecs = input.handlePageTimeoutSecs;
    }

    const puppeteerCrawler = new Apify.PuppeteerCrawler(crawlerOptions);
    await puppeteerCrawler.run();
});
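
Each processed page becomes one record in the run's default dataset. A successful record is shaped roughly like this; the values are illustrative, and the #debug field holds the request/response metadata produced by Apify.utils.createRequestDebugInfo:

{
    "url": "https://apify.com",
    "finishedAt": "2019-06-01T12:34:56.789Z",
    "html": "<body>...</body>",
    "#debug": { "requestId": "a1B2c3", "statusCode": 200 },
    "#error": false
}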

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^0.14.15"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
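
To run the actor locally, you can use the SDK's local storage emulation: with the APIFY_LOCAL_STORAGE_DIR environment variable pointing at a directory such as ./apify_storage (the apify CLI's "apify run" command sets this up for you), Apify.getValue('INPUT') reads the input from apify_storage/key_value_stores/default/INPUT.json, which could contain for example:

{
    "requestListSources": [
        { "url": "https://apify.com" }
    ]
}

After that, npm start launches main.js as declared in the scripts section above.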