Web Screenshot + HTML Extractor Actor
Deprecated
Pricing: $5.00 / 1,000 screenshots
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
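The Apify platform builds this image automatically on deploy; the file is referenced by the "dockerfile" field of .actor/actor.json below. For a local check, a plain docker build -f .actor/Dockerfile . from the repository root should produce the same image, assuming Docker is available.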
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Project Puppeteer Crawler JavaScript", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Screenshot & HTML Extractor", "description": "Take screenshots or extract HTML from web pages using Puppeteer.", "type": "object", "schemaVersion": 1, "properties": { "url": { "type": "string", "title": "Target URL", "description": "The URL of the page you want to screenshot or extract HTML from.", "editor": "textfield" }, "startUrls": { "type": "array", "title": "Start URLs", "description": "Alternative to 'url' input. Use this if you want to run the Actor on multiple pages.", "editor": "requestListSources" }, "task": { "type": "string", "title": "Task Type", "description": "Choose whether to take a screenshot or extract HTML.", "enum": ["screenshot", "extract_html"], "editor": "select", "default": "screenshot" }, "format": { "type": "string", "title": "Output Format", "description": "The file format for the screenshot.", "enum": ["png", "jpeg", "webp", "pdf"], "editor": "select", "default": "png" }, "fullPage": { "type": "boolean", "title": "Full Page Screenshot", "description": "Capture the full scroll height of the page.", "default": false, "editor": "checkbox" }, "fullPageScroll": { "type": "boolean", "title": "Scroll Before Capture", "description": "Scrolls down the page to trigger lazy-loaded content.", "default": false, "editor": "checkbox" }, "fullPageScrollDuration": { "type": "integer", "title": "Scroll Duration (ms)", "description": "How long to spend scrolling the page before taking the screenshot.", "default": 400, "minimum": 100, "maximum": 10000, "editor": "number" }, "delay": { "type": "integer", "title": "Delay Before Capture (s)", "description": "Wait for this many seconds before capturing.", "default": 0, "minimum": 0, "maximum": 10, "editor": "number" }, "selector": { "type": "string", "title": "Capture Specific Selector", "description": "CSS selector of an element to screenshot or extract HTML from.", "default": "", "editor": "textfield" }, "waitForSelector": { "type": "string", "title": "Wait for Selector", "description": "Wait for this CSS selector before continuing.", "default": "", "editor": "textfield" }, "removeSelectors": { "type": "string", "title": "Remove Elements", "description": "Comma-separated list of selectors to remove before capture.", "default": "", "editor": "textfield" }, "removeCookieBanners": { "type": "boolean", "title": "Remove Cookie Banners", "description": "Hide known cookie consent overlays using filters and scripts.", "default": false, "editor": "checkbox" }, "imageQuality": { "type": "integer", "title": "Image Quality", "description": "Only applies to JPEG and WebP formats (1–100).", "default": 80, "minimum": 1, "maximum": 100, "editor": "number" } }, "required": ["url"]}
src/main.js
// Apify SDK - toolkit for building Apify Actors
import { Actor } from 'apify';
import { PuppeteerCrawler } from 'crawlee';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import puppeteerExtra from 'puppeteer-extra';
import { PuppeteerBlocker } from '@cliqz/adblocker-puppeteer';
import fetch from 'cross-fetch';
import autoconsent from '@duckduckgo/autoconsent';

// Extend Puppeteer with the stealth plugin to reduce headless detection
puppeteerExtra.use(stealthPlugin());

await Actor.init();

const input = await Actor.getInput();
// Accept either a list of start URLs or a single 'url' input
const startUrls = input?.startUrls || [{ url: input?.url }];

const proxyConfiguration = await Actor.createProxyConfiguration();

const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    launchContext: {
        useChrome: true,
        launcher: puppeteerExtra,
        launchOptions: {
            args: [
                '--disable-gpu',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--autoplay-policy=no-user-gesture-required',
            ],
        },
    },
    async requestHandler({ page, log }) {
        const {
            task = 'screenshot',
            format = 'png',
            fullPage = false,
            fullPageScroll = false,
            fullPageScrollDuration = 400,
            delay = 0,
            selector = '',
            waitForSelector = '',
            removeSelectors = '',
            removeCookieBanners = false,
            imageQuality = 80,
        } = input ?? {};

        if (removeCookieBanners) {
            // Block known consent overlays via the Fanboy cookie filter list,
            // run DuckDuckGo autoconsent, and hide common dialog elements.
            const blocker = await PuppeteerBlocker.fromLists(fetch, [
                'https://secure.fanboy.co.nz/fanboy-cookiemonster.txt',
            ]);
            await blocker.enableBlockingInPage(page);
            await page.evaluateOnNewDocument(autoconsent.script);
            await page.evaluate(() => {
                document
                    .querySelectorAll('[role="dialog"], [id*="cookie"], [class*="cookie"]')
                    .forEach((el) => { el.style.display = 'none'; });
            });
        }

        if (delay > 0) await page.waitForTimeout(delay * 1000);
        if (waitForSelector) {
            try {
                await page.waitForSelector(waitForSelector, { timeout: 10000 });
            } catch (e) {
                log.warning(`Selector ${waitForSelector} not found.`);
            }
        }
        if (removeSelectors) {
            // Strip unwanted elements (ads, banners, etc.) before capture
            const selectors = removeSelectors.split(',').map((s) => s.trim());
            await page.evaluate((sels) => {
                sels.forEach((sel) => {
                    document.querySelectorAll(sel).forEach((el) => el.remove());
                });
            }, selectors);
        }

        if (task === 'extract_html') {
            const content = selector
                ? await page.$eval(selector, (el) => el.outerHTML)
                : await page.content();

            await Actor.setValue('page.html', content, { contentType: 'text/html' });
            return;
        }

        if (fullPage && fullPageScroll) {
            // Scroll through the page in small steps to trigger lazy-loaded content
            const pageHeight = await page.evaluate(() => Math.max(
                document.body.scrollHeight,
                document.documentElement.scrollHeight,
                document.body.offsetHeight,
                document.documentElement.offsetHeight,
                document.body.clientHeight,
                document.documentElement.clientHeight,
            ));
            const steps = 20;
            const stepSize = pageHeight / steps;
            const stepDelay = fullPageScrollDuration / steps;

            for (let i = 0; i <= steps; i++) {
                await page.evaluate((scrollTo) => window.scrollTo(0, scrollTo), i * stepSize);
                await page.waitForTimeout(stepDelay);
            }

            await page.waitForTimeout(500);
        }

        // Quality only applies to lossy formats
        const qualityOptions = {};
        if (format === 'jpeg' || format === 'webp') {
            qualityOptions.quality = imageQuality;
        }

        let buffer;

        if (format === 'pdf') {
            buffer = await page.pdf({ format: 'A4', printBackground: true });
        } else if (selector) {
            const element = await page.$(selector);
            if (!element) throw new Error(`Selector "${selector}" not found.`);
            buffer = await element.screenshot({ type: format, ...qualityOptions });
        } else {
            buffer = await page.screenshot({ type: format, fullPage, ...qualityOptions });
        }

        await Actor.setValue(`screenshot.${format}`, buffer, {
            contentType: format === 'pdf' ? 'application/pdf' : `image/${format}`,
        });
    },
});

await crawler.run(startUrls);

await Actor.exit();
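Once a run finishes, the result sits in the run's default key-value store under screenshot.<format> (or page.html for the extract_html task). A minimal sketch of fetching it with the apify-client package; the Actor ID and the APIFY_TOKEN environment variable are placeholder assumptions, not part of this repository:

import { ApifyClient } from 'apify-client';
import { writeFile } from 'node:fs/promises';

// Token comes from the Apify console; the Actor ID below is a placeholder.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the Actor and wait for the run to finish.
const run = await client.actor('your-username/screenshot-html-extractor').call({
    url: 'https://example.com',
    task: 'screenshot',
    format: 'png',
});

// Read the stored record from the run's default key-value store.
const record = await client
    .keyValueStore(run.defaultKeyValueStoreId)
    .getRecord('screenshot.png', { buffer: true });

await writeFile('screenshot.png', record.value);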
src/routes.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await Dataset.pushData({
        url: request.loadedUrl,
        title,
    });
});
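This router is the unmodified scaffold from the Crawlee Puppeteer template: src/main.js defines its own inline requestHandler and never imports it, so the file is unused at runtime. It could be deleted, or wired in by passing requestHandler: router to the PuppeteerCrawler constructor.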
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.zed
dist
node_modules
apify_storage
storage
package.json
{ "name": "crawlee-puppeteer-javascript", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "dependencies": { "apify": "^3.2.6", "crawlee": "^3.11.5", "puppeteer": "*", "@cliqz/adblocker-puppeteer": "^1.34.0", "@duckduckgo/autoconsent": "^12.9.0", "cross-fetch": "^4.1.0","puppeteer-extra": "^3.3.4", "puppeteer-extra-plugin-stealth": "^2.11.1" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "eslint": "^8.50.0" }, "scripts": { "start": "node src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}