
Download HTML from URLs
Pricing
Pay per usage
Go to Apify Store

Download HTML from URLs
This actor takes a list of URLs and downloads the HTML of each page.
0.0 (0)
Pricing
Pay per usage
27
8.8K
92
Last modified
7 days ago
Pricing
Pay per usage
This actor takes a list of URLs and downloads the HTML of each page.
0.0 (0)
Pricing
Pay per usage
27
8.8K
92
Last modified
7 days ago
# Build on the Apify Node.js 16 base image that ships Puppeteer and Chrome.
FROM apify/actor-node-puppeteer-chrome:16

# Copy only the package manifests first, so the install step below does not
# depend on the rest of the source tree.
COPY package*.json ./

# Install production dependencies only (no optional packages), then print the
# installed package tree and the Node.js / NPM versions into the build log.
# `npm list` is allowed to fail (`|| true`) so it cannot break the build.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the remaining source files into the image.
COPY . ./

# Silence the SDK's outdated-version warning and NPM's own log output.
ENV APIFY_DISABLE_OUTDATED_WARNING 1
ENV npm_config_loglevel=silent
const Apify = require('apify');

/**
 * Actor entry point.
 *
 * Reads the actor input, builds a request list from `input.requestListSources`,
 * opens every URL in a Puppeteer-driven browser and pushes one record per
 * request via `Apify.pushData` - either the page HTML on success or an error
 * marker on failure.
 *
 * Input fields read here:
 *  - requestListSources    sources passed to `Apify.openRequestList`
 *  - proxyConfiguration    options passed to `Apify.createProxyConfiguration`
 *  - maxRequestRetries     number of retries (defaults to 1 when not a number)
 *  - handlePageTimeoutSecs page handler timeout in seconds (defaults to 60)
 *  - useChrome             whether to launch Chrome (defaults to false)
 */
Apify.main(async () => {
    const input = await Apify.getInput();

    const requestList = await Apify.openRequestList('my-list', input.requestListSources);
    const proxyConfiguration = await Apify.createProxyConfiguration(input.proxyConfiguration);

    /**
     * Handles a successfully opened page: optionally waits for a selector
     * supplied in the request's `userData`, then stores the page HTML.
     */
    const handlePageFunction = async ({ request, response, page }) => {
        const { waitForSelector } = request.userData;

        // Only wait when the request explicitly asks for a selector.
        if (waitForSelector) {
            await page.waitForSelector(waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            // Serialized full document as returned by Puppeteer.
            fullHtml: await page.content(),
            // Outer HTML of the <body> element only, read inside the page.
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    /**
     * Stores an error record for a request passed to the crawler's
     * failed-request handler, so every input URL yields an output row.
     */
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const puppeteerCrawler = new Apify.PuppeteerCrawler({
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        proxyConfiguration,
        useSessionPool: true,
        sessionPoolOptions: {
            sessionOptions: {
                maxErrorScore: 0.5,
            },
        },
        browserPoolOptions: {
            useFingerprints: true,
            retireBrowserAfterPageCount: 1,
            maxOpenPagesPerBrowser: 1, // required to use one IP per tab
        },
        persistCookiesPerSession: false,
        // `typeof` check (rather than `||`) so an explicit 0 retries is honored.
        maxRequestRetries: typeof input.maxRequestRetries === 'number' ? input.maxRequestRetries : 1,
        // Missing or 0 falls back to 60 seconds.
        handlePageTimeoutSecs: input.handlePageTimeoutSecs || 60,
        launchContext: {
            useChrome: input.useChrome || false,
            launchOptions: {
                // NOTE(review): the browser is launched headful; presumably the
                // base image provides a display for this - confirm.
                headless: false,
                ignoreHTTPSErrors: true,
                args: ['--ignore-certificate-errors'],
            },
        },
    });

    await puppeteerCrawler.run();
});
{ "name": "my-actor", "version": "0.0.1", "dependencies": { "apify": "^2.3.2", "puppeteer": "^13" }, "scripts": { "start": "node main.js" }, "author": "Me!"}