Download HTML from URLs
Try for free
No credit card required
View all Actors
Download HTML from URLs
mtrunkat/url-list-download-html
Try for free
No credit card required
This actor takes a list of URLs and downloads the HTML of each page.
Dockerfile
# Base image with Node.js 16, Puppeteer, and a matching Chrome preinstalled.
FROM apify/actor-node-puppeteer-chrome:16

# Copy manifests first so the npm-install layer is cached when only sources change.
COPY package*.json ./

# Install production dependencies quietly, then log the installed packages and
# tool versions for easier debugging of build issues.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the rest of the actor's source code.
COPY . ./

# Silence the SDK's outdated-version warning and keep npm quiet at runtime.
ENV APIFY_DISABLE_OUTDATED_WARNING 1
ENV npm_config_loglevel=silent
INPUT_SCHEMA.json
{
    "title": "Input",
    "type": "object",
    "description": "Use the following form to configure this scraper. The URL list is required and all other fields are optional.",
    "schemaVersion": 1,
    "properties": {
        "requestListSources": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources",
            "minItems": 1
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": true },
            "default": {},
            "editor": "proxy"
        },
        "handlePageTimeoutSecs": {
            "title": "Page timeout",
            "type": "integer",
            "description": "Maximum time the scraper will spend processing one page.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "maxRequestRetries": {
            "title": "Maximum request retries",
            "description": "How many retries before giving up.",
            "default": 1,
            "prefill": 1,
            "type": "integer",
            "editor": "number"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of a Chromium masking as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.",
            "default": false,
            "groupCaption": "Browser masking options",
            "groupDescription": "Settings that help mask as a real user and prevent scraper detection."
        }
    },
    "required": ["requestListSources", "proxyConfiguration"]
}
main.js
/**
 * Actor: Download HTML from URLs.
 *
 * Reads a list of URLs from the actor input, opens each page with a
 * Puppeteer-driven browser, and pushes the page's HTML (full document and
 * <body> only) to the default dataset. Requests that exhaust all retries are
 * recorded with an '#error': true flag so consumers can tell them apart.
 */
const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getInput();

    // Static URL list from input; the named list persists progress across restarts.
    const requestList = await Apify.openRequestList('my-list', input.requestListSources);
    const proxyConfiguration = await Apify.createProxyConfiguration(input.proxyConfiguration);

    // Called for every successfully loaded page.
    const handlePageFunction = async ({ request, response, page }) => {
        const { waitForSelector } = request.userData;

        // Per-request option: wait for a CSS selector before grabbing the HTML.
        if (waitForSelector) {
            await page.waitForSelector(waitForSelector);
        }

        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            fullHtml: await page.content(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': Apify.utils.createRequestDebugInfo(request, response),
            '#error': false,
        });
    };

    // Called after a request fails all retries; record the failure in the dataset.
    const handleFailedRequestFunction = async ({ request }) => {
        await Apify.pushData({
            url: request.url,
            finishedAt: new Date(),
            '#debug': Apify.utils.createRequestDebugInfo(request),
            '#error': true,
        });
    };

    const puppeteerCrawler = new Apify.PuppeteerCrawler({
        requestList,
        handlePageFunction,
        handleFailedRequestFunction,
        proxyConfiguration,
        useSessionPool: true,
        sessionPoolOptions: {
            sessionOptions: {
                // Retire a session quickly after errors (lower than the SDK default).
                maxErrorScore: 0.5,
            },
        },
        browserPoolOptions: {
            useFingerprints: true,
            // A fresh browser per page keeps one proxy IP per page load.
            retireBrowserAfterPageCount: 1,
            maxOpenPagesPerBrowser: 1, // required to use one IP per tab
        },
        persistCookiesPerSession: false,
        // typeof-check (not truthiness) so an explicit 0 disables retries.
        maxRequestRetries: typeof input.maxRequestRetries === 'number' ? input.maxRequestRetries : 1,
        handlePageTimeoutSecs: input.handlePageTimeoutSecs || 60,
        launchContext: {
            useChrome: input.useChrome || false,
            launchOptions: {
                // Headful Chrome — the Apify base image runs it under Xvfb.
                headless: false,
                ignoreHTTPSErrors: true,
                args: ['--ignore-certificate-errors'],
            },
        },
    });

    await puppeteerCrawler.run();
});
package.json
{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.3.2",
        "puppeteer": "^13"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
Developer
Maintained by Community
Actor metrics
- 51 monthly users
- 99.9% runs succeeded
- 11.1 days response time
- Created in Feb 2018
- Modified 10 days ago
Categories