Puppeteer Scraper

Developed and maintained by Apify

Crawls websites with headless Chrome and the Puppeteer library using provided server-side Node.js code. This crawler is an alternative to apify/web-scraper that gives you finer control over the process. Supports both recursive crawling and lists of URLs, as well as logging in to websites.

Rating: 5.0 (5 reviews)
Pricing: Pay per usage
Total users: 6.7k
Monthly users: 843
Runs succeeded: >99%
Issue response: 3.7 days
Last modified: 21 days ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules

CHANGELOG.md

# Change Log

## 3.0.12 (2024-10-25)

- Updated Crawlee version to v3.11.5 and SDK to v3.2.6.
- Updated Node.js to v22.

## 3.0.11 (2024-04-09)

- Updated Crawlee version to v3.8.0.
- Updated to use the new request queue in the scraper.

## 3.0.8 (2023-08-22)

- Updated Crawlee version to v3.5.2.
- Updated Node.js version to v18.
- Added new options:
    - **Dismiss cookie modals** (`closeCookieModals`): Uses the [I don't care about cookies](https://addons.mozilla.org/en-US/firefox/addon/i-dont-care-about-cookies/) browser extension. When on, the crawler will automatically try to dismiss cookie consent modals. This can be useful when crawling European websites that show cookie consent modals.
    - **Maximum scrolling distance in pixels** (`maxScrollHeightPixels`): The crawler will scroll down the page until all content is loaded or the maximum scrolling distance is reached. Setting this to `0` disables scrolling altogether.
    - **Exclude Glob Patterns** (`excludes`): Glob patterns to match links in the page that you want to exclude from being enqueued.

## 3.0 (`version-3`)

- Rewritten from the Apify SDK to Crawlee; see the [v3 migration guide](https://sdk.apify.com/docs/upgrading/upgrading-to-v3) for more details.
- Proxy usage is now required.

## 2.0 (`version-2`)

The main difference between v1 and v2 of the scrapers is the upgrade of the SDK to v2, which requires Node.js v15.10+. SDK v2 uses HTTP/2 to make requests with cheerio-scraper, and HTTP/2 support in older Node.js versions was too buggy, so we decided to drop support for them. If you need to run on an older Node.js version, use SDK v1.

Please refer to the SDK 1.0 migration guide for more details about functional changes in the SDK. SDK v2 basically only changes the required Node.js version and has no other breaking changes.

- The deprecated `useRequestQueue` option has been removed.
    - `RequestQueue` will always be used.
- The deprecated `context.html` getter from `cheerio-scraper` has been removed.
    - Use `context.body` instead.
- The deprecated `prepareRequestFunction` input option has been removed.
    - Use `preNavigationHooks`/`postNavigationHooks` instead.
- Removed `puppeteerPool`/`autoscaledPool` from the `crawlingContext` object.
    - `puppeteerPool` was replaced by `browserPool`.
    - `autoscaledPool` and `browserPool` are available on the `crawler` property of the `crawlingContext` object.
- The custom "Key-value store name" option in Advanced configuration now works; previously the default store was always used.
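
Because the deprecated `prepareRequestFunction` was replaced by `preNavigationHooks` and `postNavigationHooks` (see the v2 notes above), the input now accepts arrays of hook functions. The sketches below show the shape those arrays take; the extra header and log messages are purely illustrative assumptions, not defaults of this Actor.

```javascript
// Pre-navigation hooks: run before page.goto() and may adjust gotoOptions.
[
    async (crawlingContext, gotoOptions) => {
        const { page, request, log } = crawlingContext;
        // Illustrative header; set whatever the target site requires.
        await page.setExtraHTTPHeaders({ 'x-example-header': 'value' });
        if (gotoOptions) gotoOptions.timeout = 30_000;
        log.info(`Navigating to ${request.url}`);
    },
]
```

```javascript
// Post-navigation hooks: run after navigation, e.g. to verify the page loaded.
[
    async (crawlingContext) => {
        const { page, log } = crawlingContext;
        log.info(`Loaded page with title: ${await page.title()}`);
    },
]
```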

Dockerfile

FROM apify/actor-node-puppeteer-chrome:22 AS builder
COPY --chown=myuser package*.json ./
RUN npm install --include=dev --audit=false
COPY --chown=myuser . ./
RUN npm run build
FROM apify/actor-node-puppeteer-chrome:22
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
COPY --chown=myuser package*.json ./
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
COPY --chown=myuser . ./
ENV APIFY_DISABLE_OUTDATED_WARNING=1
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

INPUT_SCHEMA.json

{
"title": "Puppeteer Scraper Input",
"description": "Use the following form to configure Puppeteer Scraper. The Start URLs and Page function are required and all other fields are optional. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the Actor's README in the ACTOR INFO tab.",
"type": "object",
"schemaVersion": 1,
"properties": {
"startUrls": {
"sectionCaption": "Basic configuration",
"title": "Start URLs",
"type": "array",
"description": "URLs to start with",
"prefill": [{ "url": "https://crawlee.dev" }],
"editor": "requestListSources"
},
"globs": {
"title": "Glob Patterns",
"type": "array",
"description": "Glob patterns to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Glob patterns will cause the scraper to enqueue all links matched by the Link selector.",
"editor": "globs",
"default": [],
"prefill": [
{
"glob": "https://crawlee.dev/*/*"
}
]
},
"pseudoUrls": {
"title": "Pseudo-URLs",
"type": "array",
"description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.",
"editor": "pseudoUrls",
"default": [],
"prefill": []
},
"excludes": {
"title": "Exclude Glob Patterns",
"type": "array",
"description": "Glob patterns to match links in the page that you want to exclude from being enqueued.",
"editor": "globs",
"default": [],
"prefill": [
{
"glob": "/**/*.{png,jpg,jpeg,pdf}"
}
]
},
"linkSelector": {
"title": "Link selector",
"type": "string",
"description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue urls from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.",
"editor": "textfield",
"prefill": "a"
},
"clickableElementsSelector": {
"title": "Clickable elements selector",
"type": "string",
"description": "For pages where simple 'href' links are not available, this attribute allows you to specify a CSS selector matching elements that the scraper will mouse click after the page function finishes. Any triggered requests, navigations or open tabs will be intercepted and the target URLs will be filtered using Pseudo URLs and/or Glob patterns and subsequently added to the request queue. Leave empty to prevent the scraper from clicking in the page. Using this setting will have a performance impact.",
"editor": "textfield"
},
"keepUrlFragments": {
"title": "Keep URL fragments",
"type": "boolean",
"description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
"default": false
},
"respectRobotsTxtFile": {
"title": "Respect the robots.txt file",
"type": "boolean",
"description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
"default": false,
"prefill": true
},
"pageFunction": {
"title": "Page function",
"type": "string",
"description": "Function executed for each request",
"prefill": "async function pageFunction(context) {\n const { page, request, log } = context;\n const title = await page.title();\n log.info(`URL: ${request.url} TITLE: ${title}`);\n return {\n url: request.url,\n title\n };\n}",
"editor": "javascript"
},
"proxyConfiguration": {
"sectionCaption": "Proxy and browser configuration",
"title": "Proxy configuration",
"type": "object",
"description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/apify/puppeteer-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
"prefill": { "useApifyProxy": true },
"default": { "useApifyProxy": true },
"editor": "proxy"
},
"proxyRotation": {
"title": "Proxy rotation",
"type": "string",
"description": "This property indicates the strategy of proxy rotation and can only be used in conjunction with Apify Proxy. The recommended setting automatically picks the best proxies from your available pool and rotates them evenly, discarding proxies that become blocked or unresponsive. If this strategy does not work for you for any reason, you may configure the scraper to either use a new proxy for each request, or to use one proxy as long as possible, until the proxy fails. IMPORTANT: This setting will only use your available Apify Proxy pool, so if you don't have enough proxies for a given task, no rotation setting will produce satisfactory results.",
"default": "RECOMMENDED",
"editor": "select",
"enum": ["RECOMMENDED", "PER_REQUEST", "UNTIL_FAILURE"],
"enumTitles": [
"Use recommended settings",
"Rotate proxy after each request",
"Use one proxy until failure"
]
},
"sessionPoolName": {
"title": "Session pool name",
"type": "string",
"description": "<b>Use only english alphanumeric characters dashes and underscores.</b> A session is a representation of a user. It has it's own IP and cookies which are then used together to emulate a real user. Usage of the sessions is controlled by the Proxy rotation option. By providing a session pool name, you enable sharing of those sessions across multiple Actor runs. This is very useful when you need specific cookies for accessing the websites or when a lot of your proxies are already blocked. Instead of trying randomly, a list of working sessions will be saved and a new Actor run can reuse those sessions. Note that the IP lock on sessions expires after 24 hours, unless the session is used again in that window.",
"editor": "textfield",
"minLength": 3,
"maxLength": 200,
"pattern": "[0-9A-z-]"
},
"initialCookies": {
"title": "Initial cookies",
"type": "array",
"description": "The provided cookies will be pre-set to all pages the scraper opens.",
"default": [],
"prefill": [],
"editor": "json"
},
"useChrome": {
"title": "Use Chrome",
"type": "boolean",
"description": "The scraper will use a real Chrome browser instead of a Chromium masking as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.",
"default": false,
"groupCaption": "Browser masking",
"groupDescription": "Settings that help mask as a real user and prevent scraper detection."
},
"headless": {
"title": "Run browsers in headless mode",
"type": "boolean",
"description": "By default, browsers run in headless mode. You can toggle this off to run them in headful mode, which can help with certain rare anti-scraping protections but is slower and more costly.",
"default": true
},
"ignoreSslErrors": {
"title": "Ignore SSL errors",
"type": "boolean",
"description": "Scraper will ignore SSL certificate errors.",
"default": false,
"groupCaption": "Security",
"groupDescription": "Various options that help you disable browser security features such as HTTPS certificates and CORS."
},
"ignoreCorsAndCsp": {
"title": "Ignore CORS and CSP",
"type": "boolean",
"description": "Scraper will ignore CSP (content security policy) and CORS (cross origin resource sharing) settings of visited pages and requested domains. This enables you to freely use XHR/Fetch to make HTTP requests from the scraper.",
"default": false
},
"downloadMedia": {
"sectionCaption": "Performance and limits",
"title": "Download media",
"type": "boolean",
"description": "Scraper will download media such as images, fonts, videos and sounds. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
"default": true,
"groupCaption": "Resources",
"groupDescription": "Settings that help to disable downloading web page resources."
},
"downloadCss": {
"title": "Download CSS",
"type": "boolean",
"description": "Scraper will download CSS stylesheets. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
"default": true
},
"maxRequestRetries": {
"title": "Max request retries",
"type": "integer",
"description": "Maximum number of times the request for the page will be retried in case of an error. Setting it to 0 means that the request will be attempted once and will not be retried if it fails.",
"minimum": 0,
"default": 3,
"unit": "retries"
},
"maxPagesPerCrawl": {
"title": "Max pages per run",
"type": "integer",
"description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
"minimum": 0,
"default": 0,
"unit": "pages"
},
"maxResultsPerCrawl": {
"title": "Max result records",
"type": "integer",
"description": "Maximum number of results that will be saved to dataset. The scraper will terminate afterwards. 0 means unlimited.",
"minimum": 0,
"default": 0,
"unit": "results"
},
"maxCrawlingDepth": {
"title": "Max crawling depth",
"type": "integer",
"description": "Defines how many links away from the StartURLs will the scraper descend. 0 means unlimited.",
"minimum": 0,
"default": 0
},
"maxConcurrency": {
"title": "Max concurrency",
"type": "integer",
"description": "Defines how many pages can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. Use this option to set a hard limit.",
"minimum": 1,
"default": 50
},
"pageLoadTimeoutSecs": {
"title": "Page load timeout",
"type": "integer",
"description": "Maximum time the scraper will allow a web page to load in seconds.",
"minimum": 1,
"default": 60,
"unit": "secs"
},
"pageFunctionTimeoutSecs": {
"title": "Page function timeout",
"type": "integer",
"description": "Maximum time the scraper will wait for the page function to execute in seconds.",
"minimum": 1,
"default": 60,
"unit": "secs"
},
"waitUntil": {
"title": "Navigation wait until",
"type": "array",
"description": "The scraper will wait until the selected events are triggered in the page before executing the page function. Available events are <code>domcontentloaded</code>, <code>load</code>, <code>networkidle2</code> and <code>networkidle0</code>. <a href=\"https://pptr.dev/#?product=Puppeteer&show=api-pagegotourl-options\" target=\"_blank\">See Puppeteer docs</a>.",
"default": ["networkidle2"],
"prefill": ["networkidle2"],
"editor": "json"
},
"preNavigationHooks": {
"sectionCaption": "Advanced configuration",
"title": "Pre-navigation hooks",
"type": "string",
"description": "Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`, which are passed to the `page.goto()` function the crawler calls to navigate.",
"prefill": "// We need to return array of (possibly async) functions here.\n// The functions accept two arguments: the \"crawlingContext\" object\n// and \"gotoOptions\".\n[\n async (crawlingContext, gotoOptions) => {\n const { page } = crawlingContext;\n // ...\n },\n]",
"editor": "javascript"
},
"postNavigationHooks": {
"title": "Post-navigation hooks",
"type": "string",
"description": "Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful. The function accepts `crawlingContext` as the only parameter.",
"prefill": "// We need to return array of (possibly async) functions here.\n// The functions accept a single argument: the \"crawlingContext\" object.\n[\n async (crawlingContext) => {\n const { page } = crawlingContext;\n // ...\n },\n]",
"editor": "javascript"
},
"closeCookieModals": {
"title": "Dismiss cookie modals",
"type": "boolean",
"description": "Using the [I don't care about cookies](https://addons.mozilla.org/en-US/firefox/addon/i-dont-care-about-cookies/) browser extension. When on, the crawler will automatically try to dismiss cookie consent modals. This can be useful when crawling European websites that show cookie consent modals.",
"default": false
},
"maxScrollHeightPixels": {
"title": "Maximum scrolling distance in pixels",
"type": "integer",
"description": "The crawler will scroll down the page until all content is loaded or the maximum scrolling distance is reached. Setting this to `0` disables scrolling altogether.",
"default": 5000
},
"debugLog": {
"title": "Debug log",
"type": "boolean",
"description": "Debug messages will be included in the log. Use <code>context.log.debug('message')</code> to log your own debug messages.",
"default": false,
"groupCaption": "Enable logs",
"groupDescription": "Logs settings"
},
"browserLog": {
"title": "Browser log",
"type": "boolean",
"description": "Console messages from the Browser will be included in the log. This may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.",
"default": false
},
"customData": {
"title": "Custom data",
"type": "object",
"description": "This object will be available on pageFunction's context as customData.",
"default": {},
"prefill": {},
"editor": "json"
},
"datasetName": {
"title": "Dataset name",
"type": "string",
"description": "Name or ID of the dataset that will be used for storing results. If left empty, the default dataset of the run will be used.",
"editor": "textfield"
},
"keyValueStoreName": {
"title": "Key-value store name",
"type": "string",
"description": "Name or ID of the key-value store that will be used for storing records. If left empty, the default key-value store of the run will be used.",
"editor": "textfield"
},
"requestQueueName": {
"title": "Request queue name",
"type": "string",
"description": "Name of the request queue that will be used for storing requests. If left empty, the default request queue of the run will be used.",
"editor": "textfield"
}
},
"required": ["startUrls", "pageFunction", "proxyConfiguration"]
}
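
For reference, an illustrative input object that exercises only fields defined in the schema above (the URLs, globs, and page function body are placeholders, not recommendations):

```json
{
    "startUrls": [{ "url": "https://crawlee.dev" }],
    "globs": [{ "glob": "https://crawlee.dev/*/*" }],
    "excludes": [{ "glob": "/**/*.{png,jpg,jpeg,pdf}" }],
    "linkSelector": "a",
    "pageFunction": "async function pageFunction(context) {\n    const { page, request } = context;\n    return { url: request.url, title: await page.title() };\n}",
    "proxyConfiguration": { "useApifyProxy": true },
    "maxPagesPerCrawl": 100,
    "waitUntil": ["networkidle2"]
}
```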

package.json

{
"name": "actor-puppeteer-scraper",
"version": "3.1.0",
"private": true,
"description": "Crawl web pages using Apify, headless Chrome and Puppeteer",
"type": "module",
"dependencies": {
"@apify/scraper-tools": "^1.1.4",
"@crawlee/puppeteer": "^3.13.2",
"apify": "^3.2.6",
"idcac-playwright": "^0.1.3",
"puppeteer": "*"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/node": "^22.7.4",
"tsx": "^4.19.1",
"typescript": "~5.8.0"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc"
},
"repository": {
"type": "git",
"url": "https://github.com/apify/apify-sdk-js"
},
"author": {
"name": "Apify Technologies",
"email": "support@apify.com",
"url": "https://apify.com"
},
"contributors": [
"Marek Trunkat <marek@apify.com>",
"Ondra Urban <ondra@apify.com>"
],
"license": "Apache-2.0",
"homepage": "https://github.com/apify/apify-sdk-js"
}

tsconfig.json

{
"extends": "@apify/tsconfig",
"compilerOptions": {
"outDir": "dist",
"module": "ESNext",
"allowJs": true,
"skipLibCheck": true
},
"include": ["src"]
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "puppeteer-scraper",
"version": "0.1",
"buildTag": "latest"
}

src/main.ts

import { runActor } from '@apify/scraper-tools';

import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);

src/internals/consts.ts

import type {
    GlobInput,
    ProxyConfigurationOptions,
    PseudoUrlInput,
    RegExpInput,
    RequestOptions,
    Session,
} from '@crawlee/puppeteer';
import type { Dictionary } from '@crawlee/utils';
import type { PuppeteerLifeCycleEvent } from 'puppeteer';

export const enum ProxyRotation {
    Recommended = 'RECOMMENDED',
    PerRequest = 'PER_REQUEST',
    UntilFailure = 'UNTIL_FAILURE',
}

/**
 * Replicates the INPUT_SCHEMA with TypeScript types for quick reference
 * and IDE type check integration.
 */
export interface Input {
    startUrls: RequestOptions[];
    globs: GlobInput[];
    regexps: RegExpInput[];
    pseudoUrls: PseudoUrlInput[];
    excludes: GlobInput[];
    linkSelector?: string;
    clickableElementsSelector?: string;
    keepUrlFragments: boolean;
    respectRobotsTxtFile: boolean;
    pageFunction: string;
    preNavigationHooks?: string;
    postNavigationHooks?: string;
    proxyConfiguration: ProxyConfigurationOptions;
    proxyRotation: ProxyRotation;
    sessionPoolName?: string;
    initialCookies: Parameters<Session['setCookies']>[0];
    useChrome: boolean;
    maxScrollHeightPixels: number;
    ignoreSslErrors: boolean;
    ignoreCorsAndCsp: boolean;
    closeCookieModals: boolean;
    downloadMedia: boolean;
    downloadCss: boolean;
    maxRequestRetries: number;
    maxPagesPerCrawl: number;
    maxResultsPerCrawl: number;
    maxCrawlingDepth: number;
    maxConcurrency: number;
    pageLoadTimeoutSecs: number;
    pageFunctionTimeoutSecs: number;
    waitUntil: PuppeteerLifeCycleEvent[];
    debugLog: boolean;
    browserLog: boolean;
    customData: Dictionary;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;
    headless: boolean;
}
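
The `Input` interface mirrors `INPUT_SCHEMA.json`, so code elsewhere can consume the validated input with full type checking. A minimal sketch of how such an interface is typically read with the Apify SDK (this is not an excerpt from this Actor, which instead wires the input through `runActor` and `CrawlerSetup`):

```typescript
import { Actor } from 'apify';

import type { Input } from './consts.js';

await Actor.init();

// Actor.getInput() returns the run's input; the generic gives it the Input
// shape, and the non-null assertion assumes the platform always supplies
// an input object.
const input = (await Actor.getInput<Input>())!;
console.log(`Crawling ${input.startUrls.length} start URL(s)`);

await Actor.exit();
```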

src/internals/crawler_setup.ts

import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    EnqueueLinksByClickingElementsOptions,
    EnqueueLinksOptions,
    ProxyConfiguration,
    PuppeteerCrawlerOptions,
    PuppeteerCrawlingContext,
    Request,
} from '@crawlee/puppeteer';
import {
    Dataset,
    KeyValueStore,
    log,
    PuppeteerCrawler,
    RequestList,
    RequestQueueV2,
} from '@crawlee/puppeteer';
import type { Awaitable, Dictionary } from '@crawlee/utils';
import { sleep } from '@crawlee/utils';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { getInjectableScript } from 'idcac-playwright';
import type { HTTPResponse } from 'puppeteer';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    browserTools,
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PUPPETEER-SCRAPER-SESSION-STORE';

const {
    META_KEY,
    DEFAULT_VIEWPORT,
    DEVTOOLS_TIMEOUT_SECS,
    SESSION_MAX_USAGE_COUNTS,
} = scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);

/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Puppeteer Scraper';
    rawInput: string;
    env: ApifyEnv;
    /**
     * Used to store data that persist navigations
     */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    blockedUrlPatterns: string[] = [];
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PuppeteerCrawler;
    requestList!: RequestList;
    dataset!: Dataset;
    pagesOutputted!: number;
    private initPromise: Promise<void>;

    constructor(input: Input) {
        // Set log level early to prevent missed messages.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep this as string to be immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load page function from disk if not present on input.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Validate INPUT if not running on Apify Cloud Platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        this.input.waitUntil.forEach((event) => {
            if (
                !/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(
                    event,
                )
            ) {
                throw new Error(
                    'Navigation wait until events must be valid. See tooltip.',
                );
            }
        });

        // solving proxy rotation settings
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Functions need to be evaluated.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

        // Excluded resources
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [
                '.jpg',
                '.jpeg',
                '.png',
                '.svg',
                '.gif',
                '.webp',
                '.webm',
                '.ico',
                '.woff',
                '.eot',
            ];
        }

        if (!this.input.downloadCss) {
            this.blockedUrlPatterns.push('.css');
        }

        // Start Chromium with Debugger any time the page function includes the keyword.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Named storages
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Initialize async operations.
        this.crawler = null!;
        this.requestList = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

    private async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.requestList = await RequestList.open(
            'PUPPETEER_SCRAPER',
            startUrls,
        );

        // RequestQueue
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        // Dataset
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        // KeyValueStore
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
    }

    /**
     * Resolves to a `PuppeteerCrawler` instance.
     */
    async createCrawler() {
        await this.initPromise;

        const args = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

        const options: PuppeteerCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                useChrome: this.input.useChrome,
                launchOptions: {
                    acceptInsecureCerts: this.input.ignoreSslErrors,
                    defaultViewport: DEFAULT_VIEWPORT,
                    devtools: this.devtools,
                    args,
                    headless: this.input.headless,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PuppeteerCrawler(options);

        return this.crawler;
    }

    private _createNavigationHooks(options: PuppeteerCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session, blockRequests }, gotoOptions) => {
                // Attach a console listener to get all logs from Browser context.
                if (this.input.browserLog) browserTools.dumpConsole(page);

                // Prevent download of stylesheets and media, unless selected otherwise
                if (this.blockedUrlPatterns.length) {
                    await blockRequests({
                        urlPatterns: this.blockedUrlPatterns,
                    });
                }

                // Add initial cookies, if any.
                if (
                    this.input.initialCookies &&
                    this.input.initialCookies.length
                ) {
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
                        // setting initial cookies that are not already in the session and page
                        session?.setCookies(cookiesToSet, request.url);
                        await page.setCookie(...cookiesToSet);
                    }
                }

                // Disable content security policy.
                if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);

                if (gotoOptions) {
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    private async _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * First of all, it initializes the state that is exposed to the user via
     * `pageFunction` context.
     *
     * Then it invokes the user provided `pageFunction` with the prescribed context
     * and saves its return value.
     *
     * Finally, it makes decisions based on the current state and post-processes
     * the data returned from the `pageFunction`.
     */
    private async _requestHandler(crawlingContext: PuppeteerCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

        /**
         * PRE-PROCESSING
         */
        // Make sure that an object containing internal metadata
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

        // We must use properties and descriptors not to trigger getters / setters.
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

        // Setup and create Context.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        if (this.input.closeCookieModals) {
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

        /**
         * USER FUNCTION INVOCATION
         */
        const pageFunctionResult = await this.evaledPageFunction(context);

        /**
         * POST-PROCESSING
         */
        // Enqueue more links if Pseudo URLs and a link selector are available,
        // unless the user invoked the `skipLinks()` context function
        // or maxCrawlingDepth would be exceeded.
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

        // Save the `pageFunction`s result to the default dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    private async _handleLinks({
        request,
        enqueueLinks,
        enqueueLinksByClickingElements,
    }: PuppeteerCrawlingContext) {
        if (!this.requestQueue) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        const enqueueOptions: EnqueueLinksOptions = {
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        if (this.input.linkSelector) {
            await enqueueLinks({
                ...enqueueOptions,
                selector: this.input.linkSelector,
            });
        }

        if (this.input.clickableElementsSelector) {
            await enqueueLinksByClickingElements({
                ...enqueueOptions,
                selector: this.input.clickableElementsSelector,
            } as EnqueueLinksByClickingElementsOptions);
        }
    }

    private async _handleResult(
        request: Request,
        response?: HTTPResponse,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}
531}