import { Actor } from 'apify';
import { PuppeteerCrawler, RequestList } from 'crawlee';
import { createRequestDebugInfo } from '@crawlee/utils';

await Actor.init();

const input = await Actor.getInput();

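// Open the named request list with sources from the actor input
// and create a proxy configuration (if one was provided).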
const requestList = await RequestList.open('my-list', input.requestListSources);
const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);

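// Configure the crawler; most options map directly to fields of the actor input.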
const puppeteerCrawler = new PuppeteerCrawler({
    requestList,
    proxyConfiguration,
    useSessionPool: true,
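    // Sessions accumulate an error score; with this threshold a session
    // is retired after its first error (the default maxErrorScore is 3).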
    sessionPoolOptions: {
        sessionOptions: {
            maxErrorScore: 0.5,
        },
    },
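    // Generate browser fingerprints and use a fresh browser for every page
    // to reduce the chance of getting blocked.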
    browserPoolOptions: {
        useFingerprints: true,
        retireBrowserAfterPageCount: 1,
        maxOpenPagesPerBrowser: 1,
        preLaunchHooks: [
            (pageId, launchContext) => {
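                // Launch the full Chrome browser instead of the bundled Chromium when requested.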
                if (input.useChrome) {
                    launchContext.useChrome = true;
                }
            },
        ],
    },
    persistCookiesPerSession: false,
    maxRequestRetries: typeof input.maxRequestRetries === 'number' ? input.maxRequestRetries : 1,
    requestHandlerTimeoutSecs: input.handlePageTimeoutSecs || 60,
    headless: false,
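    // Don't fail on pages with invalid or self-signed TLS certificates.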
    launchContext: {
        launchOptions: {
            ignoreHTTPSErrors: true,
            args: ['--ignore-certificate-errors'],
        },
    },

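    // Invoked for every page that loads successfully.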
    async requestHandler({ request, response, page }) {
        const { waitForSelector } = request.userData;

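        // Optionally wait for a selector passed via request.userData before scraping.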
        if (waitForSelector) {
            await page.waitForSelector(waitForSelector);
        }

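        // Store the page's HTML together with debug metadata in the default dataset.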
        await Actor.pushData({
            url: request.url,
            finishedAt: new Date().toISOString(),
            fullHtml: await page.content(),
            html: await page.evaluate(() => document.body.outerHTML),
            '#debug': createRequestDebugInfo(request, response),
            '#error': false,
        });
    },

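    // Invoked when a request fails too many times; record the error for inspection.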
    async failedRequestHandler({ request }) {
        await Actor.pushData({
            url: request.url,
            finishedAt: new Date().toISOString(),
            '#debug': createRequestDebugInfo(request),
            '#error': true,
        });
    },
});

await puppeteerCrawler.run();

await Actor.exit();