
Download HTML from URLs
This actor takes a list of URLs and downloads the HTML of each page.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 8.4k
Monthly users: 52
Runs succeeded: >99%
Last modified: a year ago
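The actor can be started from the Apify app, via the API, or from your own code with the Apify SDK. The sketch below is illustrative only: the actor ID "username/download-html-from-urls" is a placeholder, and the input matches the schema shown further down (only the start URL list is required).

const Apify = require('apify');

Apify.main(async () => {
    // Placeholder actor ID -- replace it with the ID or name under which
    // this actor is available to your account.
    const run = await Apify.call('username/download-html-from-urls', {
        requestListSources: [
            { url: 'https://apify.com' },
            { url: 'https://example.com' },
        ],
    });

    console.log(`Actor finished with status: ${run.status}`);
});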
Dockerfile
# Dockerfile contains instructions how to build a Docker image that
# will contain all the code and configuration needed to run your actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/

# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
#  apify/actor-node-basic (Node.js 10 on Alpine Linux, small and fast)
#  apify/actor-node-chrome (Node.js 10 + Chrome on Debian)
#  apify/actor-node-chrome-xvfb (Node.js 10 + Chrome + Xvfb on Debian)
# For more information, see https://apify.com/docs/actor#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome

# Second, copy just package.json since it should be the only file
# that affects NPM install in the next step
COPY package.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
# CMD npm start
INPUT_SCHEMA.json
{ "title": "Input", "type": "object", "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.", "schemaVersion": 1, "properties": { "requestListSources": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources", "minItems": 1 }, "proxyConfiguration": { "title": "Proxy configuration", "type": "object", "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.", "prefill": { "useApifyProxy": false }, "default": {}, "editor": "proxy" }, "handlePageTimeoutSecs": { "title": "Page timeout", "type": "integer", "description": "Maximum time the scraper will spend processing one page.", "minimum": 1, "default": 60, "maximum": 360, "unit": "secs" }, "useChrome": { "title": "Use Chrome", "type": "boolean", "description": "The scraper will use a real Chrome browser instead of a Chromium masking as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.", "default": false, "groupCaption": "Browser masking options", "groupDescription": "Settings that help mask as a real user and prevent scraper detection." }, "useStealth": { "title": "Use Stealth", "type": "boolean", "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.", "default": false } }, "required": ["requestListSources"]}
main.js
const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getValue('INPUT');

    console.log(input);

    // Build the list of pages to download from the start URLs provided in the input.
    const requestList = await Apify.openRequestList('my-list', input.requestListSources);
    const launchPuppeteerOptions = Object.assign({}, input.proxyConfiguration);

    // Apply the optional browser masking settings from the input.
    if (input.useChrome) launchPuppeteerOptions.useChrome = true;
    if (input.useStealth) launchPuppeteerOptions.stealth = true;

    // Store the HTML of each successfully loaded page to the default dataset.
    const handlePageFunction = async ({ request, response, page }) => {
        if (request.userData.waitForSelector) {
            await page.waitForSelector(request.userData.waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    // Record requests that failed even after retries.
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const crawlerOptions = {
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        launchPuppeteerOptions,
    };

    if (input.handlePageTimeoutSecs) {
        crawlerOptions.handlePageTimeoutSecs = input.handlePageTimeoutSecs;
    }

    const puppeteerCrawler = new Apify.PuppeteerCrawler(crawlerOptions);
    await puppeteerCrawler.run();
});
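Each processed URL becomes one record in the default dataset. As a rough sketch, a successful page produces a record shaped like the pushData() call above (values here are illustrative), while a failed request gets '#error': true and no html field:

// Illustrative record for a successfully downloaded page.
const exampleRecord = {
    url: 'https://apify.com',
    finishedAt: '2019-05-01T12:34:56.789Z',
    html: '<body>…</body>',
    '#debug': { /* request/response debug info from createRequestDebugInfo() */ },
    '#error': false,
};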
package.json
{ "name": "my-actor", "version": "0.0.1", "dependencies": { "apify": "^0.14.15" }, "scripts": { "start": "node main.js" }, "author": "Me!"}