Web Screenshot + HTML Extractor Actor avatar
Web Screenshot + HTML Extractor Actor

Deprecated

Pricing

$5.00 / 1,000 screenshots

Go to Store
Web Screenshot + HTML Extractor Actor

Web Screenshot + HTML Extractor Actor

Deprecated

Developed by

Jonathan Geiger

Jonathan Geiger

Maintained by Community

0.0 (0)

Pricing

$5.00 / 1,000 screenshots

0

Total users

2

Monthly users

2

Runs succeeded

>99%

Last modified

a month ago

.actor/Dockerfile

# Builds the Actor image on top of Apify's Node.js 20 + Puppeteer + Chrome base image.
# You can read more about the available images at
# https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20
# Print which of these packages are already present in the image (build-log aid).
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json first, so the dependency-install
# layer below can be reused from the Docker layer cache when only source files change.
COPY --chown=myuser package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Progress output is disabled to avoid noisy logs; the
# dependency tree plus the Node.js and NPM versions are printed for debugging
# (`|| true` keeps a failing `npm list` from aborting the build), and the NPM
# cache directory is removed at the end.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since this happens after the NPM install, rebuilds are fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image: the XVFB start script runs first and `npm start` runs only if
# it exits successfully (`&&`). If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor",
"title": "Web Screenshot + HTML Extractor Actor",
"description": "Takes screenshots or extracts HTML from web pages using Crawlee and Puppeteer.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-puppeteer-chrome"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Screenshot & HTML Extractor",
    "description": "Take screenshots or extract HTML from web pages using Puppeteer. Provide either 'url' or 'startUrls'.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "type": "string",
            "title": "Target URL",
            "description": "The URL of the page you want to screenshot or extract HTML from. Used when 'startUrls' is not provided.",
            "editor": "textfield"
        },
        "startUrls": {
            "type": "array",
            "title": "Start URLs",
            "description": "Alternative to 'url' input. Use this if you want to run the Actor on multiple pages. Takes precedence over 'url' when provided.",
            "editor": "requestListSources"
        },
        "task": {
            "type": "string",
            "title": "Task Type",
            "description": "Choose whether to take a screenshot or extract HTML.",
            "enum": ["screenshot", "extract_html"],
            "editor": "select",
            "default": "screenshot"
        },
        "format": {
            "type": "string",
            "title": "Output Format",
            "description": "The file format for the screenshot. 'pdf' renders the whole page as an A4 PDF and ignores the capture selector.",
            "enum": ["png", "jpeg", "webp", "pdf"],
            "editor": "select",
            "default": "png"
        },
        "fullPage": {
            "type": "boolean",
            "title": "Full Page Screenshot",
            "description": "Capture the full scroll height of the page.",
            "default": false,
            "editor": "checkbox"
        },
        "fullPageScroll": {
            "type": "boolean",
            "title": "Scroll Before Capture",
            "description": "Scrolls down the page to trigger lazy-loaded content. Only applies when Full Page Screenshot is enabled.",
            "default": false,
            "editor": "checkbox"
        },
        "fullPageScrollDuration": {
            "type": "integer",
            "title": "Scroll Duration (ms)",
            "description": "How long to spend scrolling the page before taking the screenshot.",
            "default": 400,
            "minimum": 100,
            "maximum": 10000,
            "editor": "number"
        },
        "delay": {
            "type": "integer",
            "title": "Delay Before Capture (s)",
            "description": "Wait for this many seconds before capturing.",
            "default": 0,
            "minimum": 0,
            "maximum": 10,
            "editor": "number"
        },
        "selector": {
            "type": "string",
            "title": "Capture Specific Selector",
            "description": "CSS selector of an element to screenshot or extract HTML from.",
            "default": "",
            "editor": "textfield"
        },
        "waitForSelector": {
            "type": "string",
            "title": "Wait for Selector",
            "description": "Wait up to 10 seconds for this CSS selector before continuing.",
            "default": "",
            "editor": "textfield"
        },
        "removeSelectors": {
            "type": "string",
            "title": "Remove Elements",
            "description": "Comma-separated list of selectors to remove before capture.",
            "default": "",
            "editor": "textfield"
        },
        "removeCookieBanners": {
            "type": "boolean",
            "title": "Remove Cookie Banners",
            "description": "Hide known cookie consent overlays using filters and scripts.",
            "default": false,
            "editor": "checkbox"
        },
        "imageQuality": {
            "type": "integer",
            "title": "Image Quality",
            "description": "Only applies to JPEG and WebP formats (1–100).",
            "default": 80,
            "minimum": 1,
            "maximum": 100,
            "editor": "number"
        }
    }
}

src/main.js

// Apify SDK - toolkit for building Apify Actors
import { Actor } from 'apify';
import { PuppeteerCrawler, Dataset, log } from 'crawlee';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import puppeteerExtra from 'puppeteer-extra';
import { PuppeteerBlocker } from '@cliqz/adblocker-puppeteer';
import fetch from 'cross-fetch';
import autoconsent from '@duckduckgo/autoconsent';

/**
 * Actor entry point: visits every start URL with Puppeteer and stores either a
 * screenshot/PDF or the page HTML in the default key-value store.
 */

const COOKIE_FILTER_LIST_URL = 'https://secure.fanboy.co.nz/fanboy-cookiemonster.txt';
const WAIT_FOR_SELECTOR_TIMEOUT_MS = 10000;
const SCROLL_STEPS = 20;
const SCROLL_SETTLE_MS = 500;

/**
 * Resolves after `ms` milliseconds.
 * Replaces `page.waitForTimeout`, which has been removed from recent Puppeteer
 * releases (package.json accepts any Puppeteer version).
 *
 * @param {number} ms - Time to wait in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});

// Extend puppeteer with stealth
puppeteerExtra.use(stealthPlugin());

await Actor.init();

// `getInput()` may resolve to null when no input was provided; fall back to an
// empty object so the defaults below still apply instead of throwing.
const input = (await Actor.getInput()) ?? {};

const {
    task = 'screenshot',
    format = 'png',
    fullPage = false,
    fullPageScroll = false,
    fullPageScrollDuration = 400,
    delay = 0,
    selector = '',
    waitForSelector = '',
    removeSelectors = '',
    removeCookieBanners = false,
    imageQuality = 80,
} = input;

// Prefer a non-empty `startUrls` list; otherwise fall back to the single `url`.
// (An empty array is truthy, so a plain `||` would silently crawl nothing.)
let startUrls = [];
if (Array.isArray(input.startUrls) && input.startUrls.length > 0) {
    startUrls = input.startUrls;
} else if (input.url) {
    startUrls = [{ url: input.url }];
}

if (startUrls.length === 0) {
    await Actor.fail('Input must contain either "url" or a non-empty "startUrls" array.');
}

// With exactly one plain URL the historical fixed keys ("screenshot.png",
// "page.html") are kept; otherwise every request gets its own key so results
// do not overwrite each other.
const isSingleTarget = startUrls.length === 1 && !startUrls[0]?.requestsFromUrl;
let fallbackSequence = 0;

/**
 * Builds the key-value store key for one request's output.
 *
 * @param {object} request - Crawlee request being handled.
 * @param {string} baseName - Key prefix, e.g. "screenshot" or "page".
 * @param {string} extension - File extension without the dot.
 * @returns {string} Store key for this request's record.
 */
function resolveStoreKey(request, baseName, extension) {
    if (isSingleTarget) return `${baseName}.${extension}`;
    // NOTE(review): assumes `request.id` is populated by the request queue;
    // a running counter is used if it is not.
    fallbackSequence += 1;
    const suffix = request.id ?? String(fallbackSequence);
    return `${baseName}-${suffix}.${extension}`;
}

let cookieBlockerPromise;

/**
 * Downloads the cookie-banner filter list once and reuses the resulting blocker
 * for every request. A failed download is not cached, so the next request retries.
 *
 * @returns {Promise<PuppeteerBlocker>}
 */
function loadCookieBlocker() {
    cookieBlockerPromise ??= PuppeteerBlocker.fromLists(fetch, [COOKIE_FILTER_LIST_URL]).catch((err) => {
        cookieBlockerPromise = undefined;
        throw err;
    });
    return cookieBlockerPromise;
}

/**
 * Scrolls from the top of the page to its full height in equal steps spread
 * over `durationMs`, then waits briefly so lazy-loaded content can render.
 *
 * @param {object} page - Puppeteer page.
 * @param {number} durationMs - Total time to spend scrolling.
 * @returns {Promise<void>}
 */
async function scrollThroughPage(page, durationMs) {
    const pageHeight = await page.evaluate(() => Math.max(
        document.body.scrollHeight,
        document.documentElement.scrollHeight,
        document.body.offsetHeight,
        document.documentElement.offsetHeight,
        document.body.clientHeight,
        document.documentElement.clientHeight,
    ));
    const stepSize = pageHeight / SCROLL_STEPS;
    const stepDelay = durationMs / SCROLL_STEPS;

    // Start at step 1: the page already sits at offset 0, and exactly
    // SCROLL_STEPS waits keeps the total close to the requested duration.
    for (let step = 1; step <= SCROLL_STEPS; step++) {
        await page.evaluate((offset) => window.scrollTo(0, offset), step * stepSize);
        await sleep(stepDelay);
    }

    await sleep(SCROLL_SETTLE_MS);
}

const proxyConfiguration = await Actor.createProxyConfiguration();

const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    launchContext: {
        useChrome: true,
        launcher: puppeteerExtra,
        launchOptions: {
            args: [
                '--disable-gpu',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--autoplay-policy=no-user-gesture-required',
            ],
        },
    },
    preNavigationHooks: [
        // Cookie-banner blocking must be registered before navigation:
        // `evaluateOnNewDocument` only affects documents created afterwards, and
        // the blocker presumably hooks page events fired during loading.
        async ({ page }) => {
            if (!removeCookieBanners) return;

            const blocker = await loadCookieBlocker();
            await blocker.enableBlockingInPage(page);

            // NOTE(review): confirm that this autoconsent version really exposes
            // `script` on its default export; skipped when it is absent.
            if (autoconsent?.script) {
                await page.evaluateOnNewDocument(autoconsent.script);
            }
        },
    ],
    async requestHandler({ request, page, log }) {
        if (removeCookieBanners) {
            // Best-effort fallback: hide anything that looks like a consent dialog.
            await page.evaluate(() => {
                document
                    .querySelectorAll('[role="dialog"], [id*="cookie"], [class*="cookie"]')
                    .forEach((el) => {
                        el.style.display = 'none';
                    });
            });
        }

        if (delay > 0) await sleep(delay * 1000);

        if (waitForSelector) {
            try {
                await page.waitForSelector(waitForSelector, { timeout: WAIT_FOR_SELECTOR_TIMEOUT_MS });
            } catch (e) {
                // Deliberately non-fatal: capture whatever is rendered.
                log.warning(`Selector ${waitForSelector} not found.`);
            }
        }

        if (removeSelectors) {
            // Trim and drop empty entries: `querySelectorAll('')` throws, which a
            // trailing or doubled comma would otherwise trigger.
            const selectors = removeSelectors
                .split(',')
                .map((sel) => sel.trim())
                .filter((sel) => sel.length > 0);

            await page.evaluate((sels) => {
                sels.forEach((sel) => {
                    document.querySelectorAll(sel).forEach((el) => el.remove());
                });
            }, selectors);
        }

        if (task === 'extract_html') {
            // `$eval` throws when the selector matches nothing, failing the request.
            const content = selector
                ? await page.$eval(selector, (el) => el.outerHTML)
                : await page.content();

            const htmlKey = resolveStoreKey(request, 'page', 'html');
            await Actor.setValue(htmlKey, content, { contentType: 'text/html' });
            log.info(`Saved HTML of ${request.url} as "${htmlKey}".`);
            return;
        }

        if (fullPage && fullPageScroll) {
            await scrollThroughPage(page, fullPageScrollDuration);
        }

        // The quality option is passed only for the lossy formats.
        const qualityOptions = {};
        if (format === 'jpeg' || format === 'webp') {
            qualityOptions.quality = imageQuality;
        }

        let buffer;

        if (format === 'pdf') {
            // PDF output always covers the whole page; `selector` is ignored here.
            buffer = await page.pdf({ format: 'A4', printBackground: true });
        } else if (selector) {
            const element = await page.$(selector);
            if (!element) throw new Error(`Selector "${selector}" not found.`);
            buffer = await element.screenshot({ type: format, ...qualityOptions });
        } else {
            buffer = await page.screenshot({ type: format, fullPage, ...qualityOptions });
        }

        const screenshotKey = resolveStoreKey(request, 'screenshot', format);
        await Actor.setValue(screenshotKey, buffer, {
            contentType: format === 'pdf' ? 'application/pdf' : `image/${format}`,
        });
        log.info(`Saved capture of ${request.url} as "${screenshotKey}".`);
    },
});

await crawler.run(startUrls);

await Actor.exit();

src/routes.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

/** Puppeteer router with a default (link-discovery) route and a 'detail' route. */
export const router = createPuppeteerRouter();

/**
 * Default route: enqueues links matching https://apify.com/* under the 'detail' label.
 */
async function enqueueDetailLinks({ enqueueLinks, log }) {
    log.info('enqueueing new URLs');

    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
}

/**
 * 'detail' route: logs the page title and pushes { url, title } to the default dataset.
 */
async function saveTitle({ request, page, log }) {
    const title = await page.title();
    const url = request.loadedUrl;

    log.info(`${title}`, { url });

    await Dataset.pushData({ url, title });
}

router.addDefaultHandler(enqueueDetailLinks);
router.addHandler('detail', saveTitle);

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.zed
dist
node_modules
apify_storage
storage

package.json

{
"name": "crawlee-puppeteer-javascript",
"version": "0.0.1",
"type": "module",
"description": "This is an example of an Apify actor.",
"dependencies": {
"apify": "^3.2.6",
"crawlee": "^3.11.5",
"puppeteer": "*",
"@cliqz/adblocker-puppeteer": "^1.34.0",
"@duckduckgo/autoconsent": "^12.9.0",
"cross-fetch": "^4.1.0",
"puppeteer-extra": "^3.3.4",
"puppeteer-extra-plugin-stealth": "^2.11.1"
},
"devDependencies": {
"@apify/eslint-config": "^0.4.0",
"eslint": "^8.50.0"
},
"scripts": {
"start": "node src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}