1
2import { Actor } from 'apify';
3import { PuppeteerCrawler, Dataset, log } from 'crawlee';
4import stealthPlugin from 'puppeteer-extra-plugin-stealth';
5import puppeteerExtra from 'puppeteer-extra';
6import { PuppeteerBlocker } from '@cliqz/adblocker-puppeteer';
7import fetch from 'cross-fetch';
8import autoconsent from '@duckduckgo/autoconsent';
9
10
// Register the stealth plugin before any browser is launched through puppeteer-extra.
puppeteerExtra.use(stealthPlugin());

await Actor.init();

// The input may be missing entirely; normalise it to an empty object so later
// property reads and destructuring cannot throw on a nullish value.
const input = (await Actor.getInput()) ?? {};

/**
 * Resolves the list of start requests from the Actor input.
 *
 * An explicit `startUrls` value wins; otherwise the single `url` field is wrapped
 * into a one-element list.
 *
 * @param {object} actorInput - Normalised Actor input.
 * @returns {Array} Start requests to hand to the crawler.
 * @throws {Error} When neither `startUrls` nor a non-empty `url` is provided, so the
 *   run fails with a clear message instead of passing `{ url: undefined }` on.
 */
function resolveStartUrls(actorInput) {
  if (actorInput.startUrls) {
    return actorInput.startUrls;
  }
  if (typeof actorInput.url !== 'string' || actorInput.url.trim() === '') {
    throw new Error('Input must provide either "startUrls" or a non-empty "url".');
  }
  return [{ url: actorInput.url }];
}

const startUrls = resolveStartUrls(input);

const proxyConfiguration = await Actor.createProxyConfiguration();
19
/** Filter lists fed to the adblocker to suppress cookie-consent banners. */
const COOKIE_FILTER_LISTS = ['https://secure.fanboy.co.nz/fanboy-cookiemonster.txt'];

/** Shared adblocker promise so the filter list is downloaded once, not per request. */
let cookieBlockerPromise;

/**
 * Returns the shared cookie-banner blocker, creating it on first use.
 * A failed download clears the cached promise so the next request can retry.
 *
 * @returns {Promise<PuppeteerBlocker>} The blocker built from COOKIE_FILTER_LISTS.
 */
function getCookieBlocker() {
  if (!cookieBlockerPromise) {
    cookieBlockerPromise = PuppeteerBlocker.fromLists(fetch, COOKIE_FILTER_LISTS).catch((err) => {
      cookieBlockerPromise = undefined;
      throw err;
    });
  }
  return cookieBlockerPromise;
}

/**
 * Resolves after `ms` milliseconds. Used instead of `page.waitForTimeout`, which is
 * not available in current Puppeteer releases.
 *
 * @param {number} ms - Time to wait in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

/**
 * Splits a comma-separated selector string (or passes through an array) into a clean
 * list: entries are trimmed and empty ones dropped, because an empty selector makes
 * `querySelectorAll` throw.
 *
 * @param {string|string[]} raw - Value of the `removeSelectors` input field.
 * @returns {string[]} Non-empty, trimmed selectors.
 */
const parseSelectorList = (raw) => {
  const parts = Array.isArray(raw) ? raw : String(raw).split(',');
  return parts.map((part) => String(part).trim()).filter(Boolean);
};

/**
 * Crawler that loads each start URL in Chrome and either stores the page HTML
 * (`task === 'extract_html'`) or a screenshot/PDF in the default key-value store.
 *
 * NOTE(review): output keys (`page.html`, `screenshot.<format>`) are fixed, so with
 * several start URLs each request overwrites the previous record — confirm intended.
 */
const crawler = new PuppeteerCrawler({
  proxyConfiguration,
  launchContext: {
    useChrome: true,
    launcher: puppeteerExtra,
    launchOptions: {
      args: [
        '--disable-gpu',
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--autoplay-policy=no-user-gesture-required',
      ],
    },
  },
  preNavigationHooks: [
    // Cookie-banner blocking must be installed BEFORE navigation: request blocking and
    // `evaluateOnNewDocument` only affect documents loaded after they are registered.
    async ({ page }) => {
      if (!input?.removeCookieBanners) return;

      const blocker = await getCookieBlocker();
      await blocker.enableBlockingInPage(page);
      // NOTE(review): assumes the autoconsent package exposes an injectable `script`
      // export — verify against the installed version.
      await page.evaluateOnNewDocument(autoconsent.script);
    },
  ],
  async requestHandler({ page, log }) {
    // `input` may be null when the Actor is started without input.
    const {
      task = 'screenshot',
      format = 'png',
      fullPage = false,
      fullPageScroll = false,
      fullPageScrollDuration = 400,
      delay = 0,
      selector = '',
      waitForSelector = '',
      removeSelectors = '',
      removeCookieBanners = false,
      imageQuality = 80,
    } = input ?? {};

    if (removeCookieBanners) {
      // Fallback for banners that slipped past the blocker: hide anything that looks
      // like a dialog or a cookie container in the already-loaded document.
      await page.evaluate(() => {
        document
          .querySelectorAll('[role="dialog"], [id*="cookie"], [class*="cookie"]')
          .forEach((el) => {
            el.style.display = 'none';
          });
      });
    }

    // `delay` is expressed in seconds.
    if (delay > 0) await sleep(delay * 1000);

    if (waitForSelector) {
      try {
        await page.waitForSelector(waitForSelector, { timeout: 10000 });
      } catch (e) {
        // Best effort: continue with whatever has rendered.
        log.warning(`Selector ${waitForSelector} not found.`);
      }
    }

    if (removeSelectors) {
      const selectors = parseSelectorList(removeSelectors);
      if (selectors.length > 0) {
        await page.evaluate((sels) => {
          sels.forEach((sel) => {
            document.querySelectorAll(sel).forEach((el) => el.remove());
          });
        }, selectors);
      }
    }

    if (task === 'extract_html') {
      const content = selector
        ? await page.$eval(selector, (el) => el.outerHTML)
        : await page.content();

      await Actor.setValue('page.html', content, { contentType: 'text/html' });
      return;
    }

    if (fullPage && fullPageScroll) {
      // Scroll through the whole document in fixed steps before capturing.
      const pageHeight = await page.evaluate(() => Math.max(
        document.body.scrollHeight,
        document.documentElement.scrollHeight,
        document.body.offsetHeight,
        document.documentElement.offsetHeight,
        document.body.clientHeight,
        document.documentElement.clientHeight,
      ));
      const steps = 20;
      const stepSize = pageHeight / steps;
      const stepDelay = fullPageScrollDuration / steps;

      for (let i = 0; i <= steps; i++) {
        await page.evaluate((scrollTo) => window.scrollTo(0, scrollTo), i * stepSize);
        await sleep(stepDelay);
      }

      // Short settle time after the final scroll step.
      await sleep(500);
    }

    // `quality` is only set for the lossy formats.
    const qualityOptions = {};
    if (format === 'jpeg' || format === 'webp') {
      qualityOptions.quality = imageQuality;
    }

    let buffer;

    if (format === 'pdf') {
      buffer = await page.pdf({ format: 'A4', printBackground: true });
    } else if (selector) {
      const element = await page.$(selector);
      if (!element) throw new Error(`Selector "${selector}" not found.`);
      buffer = await element.screenshot({ type: format, ...qualityOptions });
    } else {
      buffer = await page.screenshot({ type: format, fullPage, ...qualityOptions });
    }

    await Actor.setValue(`screenshot.${format}`, buffer, {
      contentType: format === 'pdf' ? 'application/pdf' : `image/${format}`,
    });
  },
});
134
// Process every start request with the crawler configured above.
await crawler.run(startUrls);

// Shut the Actor down once the crawl has finished.
await Actor.exit();