
Camoufox Scraper
Crawls websites with the stealthy Camoufox browser and the Playwright library, using provided server-side Node.js code. Supports both recursive crawling and lists of URLs, as well as logging in to websites.
Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 2
Runs succeeded: 21%
Last modified: 4 days ago
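
Login to a website is typically handled either inside the Page function (driving the login form with Playwright) or by supplying Initial cookies captured from an already logged-in browser session. Below is a hedged sketch of the cookie approach; the cookie name, value and domain are placeholders only:

// Hedged sketch: pre-setting session cookies via the "initialCookies" input field.
// The cookie name, value and domain are illustrative placeholders, not real credentials.
const exampleLoginInput = {
    initialCookies: [
        {
            name: 'sessionid',                          // placeholder cookie name
            value: '<copied-from-a-logged-in-browser>', // placeholder value
            domain: '.example.com',
            path: '/',
        },
    ],
};
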
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8dist
9!./storage/key_value_stores/default/INPUT.json
10
11# installed files
12node_modules
Dockerfile
1FROM apify/actor-node-playwright-firefox:22 AS builder
2
3COPY package*.json ./
4
5RUN npm install --include=dev --audit=false
6
7COPY . ./
8
9RUN npm run build
10
11FROM apify/actor-node-playwright-firefox:22
12
13COPY --from=builder /home/myuser/dist ./dist
14
15COPY package*.json ./
16
17RUN npm --quiet set progress=false \
18 && npm install --omit=dev \
19 && echo "Installed NPM packages:" \
20 && (npm list --omit=dev --all || true) \
21 && echo "Node.js version:" \
22 && node --version \
23 && echo "NPM version:" \
24 && npm --version \
25 && rm -r ~/.npm
26
27RUN npm run get-binaries
28
29COPY . ./
30
31RUN rm -rf ./pw-browsers/
32
33ENV APIFY_DISABLE_OUTDATED_WARNING=1
34
35CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
INPUT_SCHEMA.json
1{
2 "$schema": "https://raw.githubusercontent.com/apify/apify-shared-js/refs/heads/master/packages/input_schema/src/schema.json",
3 "title": "Camoufox Scraper Input",
4 "description": "Use the following form to configure Camoufox Scraper. The Start URLs and Page function are required and all other fields are optional. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the Actor's README in the ACTOR INFO tab.",
5 "type": "object",
6 "schemaVersion": 1,
7 "properties": {
8 "startUrls": {
9 "sectionCaption": "Basic configuration",
10 "title": "Start URLs",
11 "type": "array",
12 "description": "URLs to start with",
13 "prefill": [{ "url": "https://crawlee.dev" }],
14 "editor": "requestListSources"
15 },
16 "globs": {
17 "title": "Glob Patterns",
18 "type": "array",
19 "description": "Glob patterns to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Glob patterns will cause the scraper to enqueue all links matched by the Link selector.",
20 "editor": "globs",
21 "default": [],
22 "prefill": [
23 {
24 "glob": "https://crawlee.dev/*/*"
25 }
26 ]
27 },
28 "pseudoUrls": {
29 "title": "Pseudo-URLs",
30 "type": "array",
31 "description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.",
32 "editor": "pseudoUrls",
33 "default": [],
34 "prefill": []
35 },
36 "excludes": {
37 "title": "Exclude Glob Patterns",
38 "type": "array",
39 "description": "Glob patterns to match links in the page that you want to exclude from being enqueued.",
40 "editor": "globs",
41 "default": [],
42 "prefill": [
43 {
44 "glob": "/**/*.{png,jpg,jpeg,pdf}"
45 }
46 ]
47 },
48 "linkSelector": {
49 "title": "Link selector",
50 "type": "string",
51 "description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue urls from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.",
52 "editor": "textfield",
53 "prefill": "a"
54 },
55 "keepUrlFragments": {
56 "title": "Keep URL fragments",
57 "type": "boolean",
58 "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
59 "default": false
60 },
61 "respectRobotsTxtFile": {
62 "title": "Respect the robots.txt file",
63 "type": "boolean",
64 "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
65 "default": false,
66 "prefill": true
67 },
68 "pageFunction": {
69 "title": "Page function",
70 "type": "string",
71 "description": "Function executed for each request",
72 "prefill": "async function pageFunction(context) {\n const { page, request, log } = context;\n const title = await page.title();\n log.info(`URL: ${request.url} TITLE: ${title}`);\n return {\n url: request.url,\n title\n };\n}",
73 "editor": "javascript"
74 },
75 "proxyConfiguration": {
76 "sectionCaption": "Proxy configuration",
77 "title": "Proxy configuration",
78 "type": "object",
79 "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/apify/camoufox-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
80 "prefill": { "useApifyProxy": true },
81 "default": { "useApifyProxy": true },
82 "editor": "proxy"
83 },
84 "proxyRotation": {
85 "title": "Proxy rotation",
86 "type": "string",
87 "description": "This property indicates the strategy of proxy rotation and can only be used in conjunction with Apify Proxy. The recommended setting automatically picks the best proxies from your available pool and rotates them evenly, discarding proxies that become blocked or unresponsive. If this strategy does not work for you for any reason, you may configure the scraper to either use a new proxy for each request, or to use one proxy as long as possible, until the proxy fails. IMPORTANT: This setting will only use your available Apify Proxy pool, so if you don't have enough proxies for a given task, no rotation setting will produce satisfactory results.",
88 "default": "RECOMMENDED",
89 "editor": "select",
90 "enum": ["RECOMMENDED", "PER_REQUEST", "UNTIL_FAILURE"],
91 "enumTitles": [
92 "Use recommended settings",
93 "Rotate proxy after each request",
94 "Use one proxy until failure"
95 ]
96 },
97 "sessionPoolName": {
98 "title": "Session pool name",
99 "type": "string",
100 "description": "<b>Use only english alphanumeric characters dashes and underscores.</b> A session is a representation of a user. It has it's own IP and cookies which are then used together to emulate a real user. Usage of the sessions is controlled by the Proxy rotation option. By providing a session pool name, you enable sharing of those sessions across multiple Actor runs. This is very useful when you need specific cookies for accessing the websites or when a lot of your proxies are already blocked. Instead of trying randomly, a list of working sessions will be saved and a new Actor run can reuse those sessions. Note that the IP lock on sessions expires after 24 hours, unless the session is used again in that window.",
101 "editor": "textfield",
102 "minLength": 3,
103 "maxLength": 200,
104 "pattern": "[0-9A-z-]"
105 },
106 "initialCookies": {
107 "title": "Initial cookies",
108 "type": "array",
109 "description": "The provided cookies will be pre-set to all pages the scraper opens.",
110 "default": [],
111 "prefill": [],
112 "editor": "json"
113 },
114 "headless": {
115 "title": "Run browsers in headless mode",
116 "type": "boolean",
117 "description": "By default, browsers run in headless mode. You can toggle this off to run them in headful mode, which can help with certain rare anti-scraping protections but is slower and more costly.",
118 "default": true
119 },
120 "ignoreSslErrors": {
121 "title": "Ignore SSL errors",
122 "type": "boolean",
123 "description": "Scraper will ignore SSL certificate errors.",
124 "default": false,
125 "groupCaption": "Security",
126 "groupDescription": "Various options that help you disable browser security features such as HTTPS certificates and CORS."
127 },
128 "downloadMedia": {
129 "sectionCaption": "Performance and limits",
130 "title": "Download media",
131 "type": "boolean",
132 "description": "Scraper will download media such as images, fonts, videos and sounds. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
133 "default": true,
134 "groupCaption": "Resources",
135 "groupDescription": "Settings that help to disable downloading web page resources."
136 },
137 "maxRequestRetries": {
138 "title": "Max request retries",
139 "type": "integer",
140 "description": "Maximum number of times the request for the page will be retried in case of an error. Setting it to 0 means that the request will be attempted once and will not be retried if it fails.",
141 "minimum": 0,
142 "default": 3,
143 "unit": "retries"
144 },
145 "maxPagesPerCrawl": {
146 "title": "Max pages per run",
147 "type": "integer",
148 "description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
149 "minimum": 0,
150 "default": 0,
151 "unit": "pages"
152 },
153 "maxResultsPerCrawl": {
154 "title": "Max result records",
155 "type": "integer",
156 "description": "Maximum number of results that will be saved to dataset. The scraper will terminate afterwards. 0 means unlimited.",
157 "minimum": 0,
158 "default": 0,
159 "unit": "results"
160 },
161 "maxCrawlingDepth": {
162 "title": "Max crawling depth",
163 "type": "integer",
164 "description": "Defines how many links away from the StartURLs will the scraper descend. 0 means unlimited.",
165 "minimum": 0,
166 "default": 0
167 },
168 "maxConcurrency": {
169 "title": "Max concurrency",
170 "type": "integer",
171 "description": "Defines how many pages can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. Use this option to set a hard limit.",
172 "minimum": 1,
173 "default": 50
174 },
175 "pageLoadTimeoutSecs": {
176 "title": "Page load timeout",
177 "type": "integer",
178 "description": "Maximum time the scraper will allow a web page to load in seconds.",
179 "minimum": 1,
180 "default": 60,
181 "unit": "secs"
182 },
183 "pageFunctionTimeoutSecs": {
184 "title": "Page function timeout",
185 "type": "integer",
186 "description": "Maximum time the scraper will wait for the page function to execute in seconds.",
187 "minimum": 1,
188 "default": 60,
189 "unit": "secs"
190 },
191 "waitUntil": {
192 "title": "Navigation wait until",
193 "description": "The scraper will wait until the selected events are triggered in the page before executing the page function. Available events are <code>domcontentloaded</code>, <code>load</code> and <code>networkidle</code> <a href=\"https://playwright.dev/docs/api/class-page#page-goto-option-wait-until\" target=\"_blank\">See Playwright docs</a>.",
194 "type": "string",
195 "editor": "select",
196 "enum": ["networkidle", "load", "domcontentloaded"],
197 "prefill": "networkidle",
198 "default": "networkidle"
199 },
200 "preNavigationHooks": {
201 "sectionCaption": "Advanced configuration",
202 "title": "Pre-navigation hooks",
203 "type": "string",
204 "description": "Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`, which are passed to the `page.goto()` function the crawler calls to navigate.",
205 "prefill": "// We need to return array of (possibly async) functions here.\n// The functions accept two arguments: the \"crawlingContext\" object\n// and \"gotoOptions\".\n[\n async (crawlingContext, gotoOptions) => {\n const { page } = crawlingContext;\n // ...\n },\n]",
206 "editor": "javascript"
207 },
208 "postNavigationHooks": {
209 "title": "Post-navigation hooks",
210 "type": "string",
211 "description": "Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful. The function accepts `crawlingContext` as the only parameter.",
212 "prefill": "// We need to return array of (possibly async) functions here.\n// The functions accept a single argument: the \"crawlingContext\" object.\n[\n async (crawlingContext) => {\n const { page } = crawlingContext;\n // ...\n },\n]",
213 "editor": "javascript"
214 },
215 "closeCookieModals": {
216 "title": "Dismiss cookie modals",
217 "type": "boolean",
218 "description": "Using the [I don't care about cookies](https://addons.mozilla.org/en-US/firefox/addon/i-dont-care-about-cookies/) browser extension. When on, the crawler will automatically try to dismiss cookie consent modals. This can be useful when crawling European websites that show cookie consent modals.",
219 "default": false
220 },
221 "maxScrollHeightPixels": {
222 "title": "Maximum scrolling distance in pixels",
223 "type": "integer",
224 "description": "The crawler will scroll down the page until all content is loaded or the maximum scrolling distance is reached. Setting this to `0` disables scrolling altogether.",
225 "default": 5000
226 },
227 "debugLog": {
228 "title": "Debug log",
229 "type": "boolean",
230 "description": "Debug messages will be included in the log. Use <code>context.log.debug('message')</code> to log your own debug messages.",
231 "default": false,
232 "groupCaption": "Enable logs",
233 "groupDescription": "Logs settings"
234 },
235 "browserLog": {
236 "title": "Browser log",
237 "type": "boolean",
238 "description": "Console messages from the Browser will be included in the log. This may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.",
239 "default": false
240 },
241 "customData": {
242 "title": "Custom data",
243 "type": "object",
244 "description": "This object will be available on pageFunction's context as customData.",
245 "default": {},
246 "prefill": {},
247 "editor": "json"
248 },
249 "datasetName": {
250 "title": "Dataset name",
251 "type": "string",
252 "description": "Name or ID of the dataset that will be used for storing results. If left empty, the default dataset of the run will be used.",
253 "editor": "textfield"
254 },
255 "keyValueStoreName": {
256 "title": "Key-value store name",
257 "type": "string",
258 "description": "Name or ID of the key-value store that will be used for storing records. If left empty, the default key-value store of the run will be used.",
259 "editor": "textfield"
260 },
261 "requestQueueName": {
262 "title": "Request queue name",
263 "type": "string",
264 "description": "Name of the request queue that will be used for storing requests. If left empty, the default request queue of the run will be used.",
265 "editor": "textfield"
266 },
267 "os": {
268 "sectionCaption": "Camoufox options",
269 "title": "Emulated operating system",
270 "type": "array",
271 "description": "Operating system to use for the fingerprint generation. Can be 'windows', 'macos', 'linux', or a list to randomly choose from.",
272 "editor": "select",
273 "items": {
274 "type": "string",
275 "enum": ["linux", "windows", "macos"],
276 "enumTitles": ["Linux", "Windows", "MacOS"]
277 }
278 },
279 "block_images": {
280 "title": "Block image loading",
281 "type": "boolean",
282 "description": "Blocks the image loading on the page. Saves bandwidth and speeds up the scraping, but might cause detection of the scraper.",
283 "default": false
284 },
285 "block_webrtc": {
286 "title": "Block WebRTC",
287 "type": "boolean",
288 "description": "Blocks WebRTC capabilities in Camoufox. Note that this might break some web applications.",
289 "default": false
290 },
291 "block_webgl": {
292 "title": "Block WebGL",
293 "type": "boolean",
294 "description": "Blocks WebGL capabilities in Camoufox. Note that this might break some websites."
295 },
296 "disable_coop": {
297 "title": "Disable Cross-Origin-Opener-Policy",
298 "type": "boolean",
299 "description": "Disables the Cross-Origin-Opener-Policy."
300 },
301 "geoip": {
302 "title": "GeoIP (⚠️ might not fully work with Apify Proxy ⚠️).",
303 "description": "Calculate geo-data based on IP address. Might not fully work with Apify Proxy.",
304 "type": "boolean",
305 "default": false
306 },
307 "humanize": {
308 "title": "Humanize cursor movement (speed in seconds)",
309 "type": "string",
310 "editor": "textfield",
311 "description": "Humanize cursor movement."
312 },
313 "locale": {
314 "title": "Locales",
315 "type": "array",
316 "editor": "stringList",
317 "description": "Locales to use in the browser / OS."
318 },
319 "fonts": {
320 "title": "Fonts",
321 "type": "array",
322 "editor": "stringList",
323 "description": "Fonts to load into the browser / OS."
324 },
325 "custom_fonts_only": {
326 "title": "Custom fonts only",
327 "type": "boolean",
328 "description": "If enabled, only custom fonts (from the option `fonts`) are used. This will disable the default OS fonts."
329 },
330 "enable_cache": {
331 "title": "Enable cache",
332 "type": "boolean",
333 "description": "Cache previous pages and requests in Camoufox."
334 },
335 "debug": {
336 "title": "Camoufox debug",
337 "type": "boolean",
338 "description": "Prints the config being sent to Camoufox."
339 }
340 },
341 "required": ["startUrls", "pageFunction", "proxyConfiguration"]
342}
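
For reference, the schema above translates into a plain input object at run time. The following is a minimal, hedged example of such an input; the URLs, globs and limits are placeholders taken from the schema prefills, not recommendations:

// Minimal example input for Camoufox Scraper, mirroring INPUT_SCHEMA.json.
// All values below are illustrative placeholders.
const exampleInput = {
    startUrls: [{ url: 'https://crawlee.dev' }],
    globs: [{ glob: 'https://crawlee.dev/*/*' }],
    excludes: [{ glob: '/**/*.{png,jpg,jpeg,pdf}' }],
    linkSelector: 'a',
    respectRobotsTxtFile: true,
    // The page function is submitted as a string and evaluated inside the Actor.
    pageFunction: `async function pageFunction(context) {
        const { page, request, log } = context;
        const title = await page.title();
        log.info(\`URL: \${request.url} TITLE: \${title}\`);
        return { url: request.url, title };
    }`,
    proxyConfiguration: { useApifyProxy: true },
    maxPagesPerCrawl: 10,
    headless: true,
};
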
package.json
1{
2 "name": "actor-camoufox-scraper",
3 "version": "3.1.0",
4 "private": true,
5 "description": "Crawl web pages using Apify, headless browser and Playwright with Camoufox",
6 "type": "module",
7 "dependencies": {
8 "@apify/scraper-tools": "^1.1.4",
9 "@crawlee/core": "^3.13.2",
10 "@crawlee/playwright": "^3.13.2",
11 "@crawlee/utils": "^3.13.2",
12 "apify": "^3.2.6",
13 "camoufox-js": "^0.3.0",
14 "idcac-playwright": "^0.1.3",
15 "playwright": "*"
16 },
17 "devDependencies": {
18 "@apify/tsconfig": "^0.1.0",
19 "@types/node": "^22.7.4",
20 "tsx": "^4.19.1",
21 "typescript": "~5.8.0"
22 },
23 "scripts": {
24 "start": "npm run start:dev",
25 "start:prod": "node dist/main.js",
26 "start:dev": "tsx src/main.ts",
27 "build": "tsc",
28 "get-binaries": "camoufox-js fetch"
29 },
30 "repository": {
31 "type": "git",
32 "url": "https://github.com/apify/apify-sdk-js"
33 },
34 "author": {
35 "name": "Apify Technologies",
36 "email": "support@apify.com",
37 "url": "https://apify.com"
38 },
39 "license": "Apache-2.0",
40 "homepage": "https://github.com/apify/apify-sdk-js"
41}
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "outDir": "dist",
5 "module": "ESNext",
6 "allowJs": true,
7 "skipLibCheck": true
8 },
9 "include": ["src"]
10}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "camoufox-scraper",
4 "version": "0.1",
5 "buildTag": "latest"
6}
src/main.ts
1import { runActor } from '@apify/scraper-tools';
2
3import { CrawlerSetup } from './internals/crawler_setup.js';
4
5runActor(CrawlerSetup);
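
The entry point delegates everything to runActor from @apify/scraper-tools. Its implementation is not shown here; conceptually it wires CrawlerSetup into the Actor lifecycle roughly as in the following hedged sketch (the function name is illustrative; only the Actor and crawler calls are real APIs):

// Hedged sketch of what runActor(CrawlerSetup) does conceptually;
// the real logic lives in @apify/scraper-tools.
import { Actor } from 'apify';
import { CrawlerSetup } from './internals/crawler_setup.js';

async function runActorSketch() {
    await Actor.init();                             // initialize the Apify SDK
    const input = (await Actor.getInput()) as any;  // INPUT validated against INPUT_SCHEMA.json
    const setup = new CrawlerSetup(input);          // validates input and prepares crawler options
    const crawler = await setup.createCrawler();    // builds the PlaywrightCrawler with Camoufox launch options
    await crawler.run();                            // crawls until the queue is empty or limits are reached
    await Actor.exit();
}
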
src/internals/consts.ts
1import type {
2 GlobInput,
3 ProxyConfigurationOptions,
4 PseudoUrlInput,
5 RegExpInput,
6 RequestOptions,
7 Session,
8} from '@crawlee/core';
9import type { Dictionary } from '@crawlee/utils';
10
11export const enum ProxyRotation {
12 Recommended = 'RECOMMENDED',
13 PerRequest = 'PER_REQUEST',
14 UntilFailure = 'UNTIL_FAILURE',
15}
16
17/**
18 * Replicates the INPUT_SCHEMA with TypeScript types for quick reference
19 * and IDE type check integration.
20 */
21export interface Input {
22 startUrls: RequestOptions[];
23 globs: GlobInput[];
24 regexps: RegExpInput[];
25 pseudoUrls: PseudoUrlInput[];
26 excludes: GlobInput[];
27 linkSelector?: string;
28 keepUrlFragments: boolean;
29 respectRobotsTxtFile: boolean;
30 pageFunction: string;
31 preNavigationHooks?: string;
32 postNavigationHooks?: string;
33 proxyConfiguration: ProxyConfigurationOptions;
34 proxyRotation: ProxyRotation;
35 sessionPoolName?: string;
36 initialCookies: Parameters<Session['setCookies']>[0];
37 maxScrollHeightPixels: number;
38 ignoreSslErrors: boolean;
39 ignoreCorsAndCsp: boolean;
40 closeCookieModals: boolean;
41 downloadMedia: boolean;
42 maxRequestRetries: number;
43 maxPagesPerCrawl: number;
44 maxResultsPerCrawl: number;
45 maxCrawlingDepth: number;
46 maxConcurrency: number;
47 pageLoadTimeoutSecs: number;
48 pageFunctionTimeoutSecs: number;
49 waitUntil: 'networkidle' | 'load' | 'domcontentloaded';
50 debugLog: boolean;
51 browserLog: boolean;
52 customData: Dictionary;
53 datasetName?: string;
54 keyValueStoreName?: string;
55 requestQueueName?: string;
56 headless: boolean;
57 os: ('linux' | 'macos' | 'windows')[];
58 blockImages: boolean;
59 blockWebrtc: boolean;
60 blockWebgl: boolean;
61 disableCoop: boolean;
62 geoip: boolean;
63 humanize: string;
64 locale: string[];
65 fonts: string[];
66 customFontsOnly: boolean;
67 enableCache: boolean;
68 debug: boolean;
69}
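
Because this interface mirrors the JSON schema, the Actor's input can be read in a typed way. A small hedged sketch, not part of the Actor's own code:

// Hedged sketch: reading the Actor input with the typed Input interface.
import { Actor } from 'apify';
import type { Input } from './consts.js';

const input = (await Actor.getInput<Input>())!;
if (input.debugLog) {
    console.log(`Crawling ${input.startUrls.length} start URL(s) with waitUntil=${input.waitUntil}`);
}
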
src/internals/crawler_setup.ts
1import { readFile } from 'node:fs/promises';
2import { dirname } from 'node:path';
3import { fileURLToPath, URL } from 'node:url';
4
5import type {
6 AutoscaledPool,
7 EnqueueLinksOptions,
8 PlaywrightCrawlerOptions,
9 PlaywrightCrawlingContext,
10 PlaywrightLaunchContext,
11 ProxyConfiguration,
12 Request,
13} from '@crawlee/playwright';
14import {
15 Dataset,
16 KeyValueStore,
17 log,
18 PlaywrightCrawler,
19 RequestList,
20 RequestQueueV2,
21} from '@crawlee/playwright';
22import type { Awaitable, Dictionary } from '@crawlee/utils';
23import { sleep } from '@crawlee/utils';
24import type { ApifyEnv } from 'apify';
25import { Actor } from 'apify';
26import { launchOptions } from 'camoufox-js';
27import { getInjectableScript } from 'idcac-playwright';
28import type { Response } from 'playwright';
29import { firefox } from 'playwright';
30
31import type {
32 CrawlerSetupOptions,
33 RequestMetadata,
34} from '@apify/scraper-tools';
35import {
36 browserTools,
37 constants as scraperToolsConstants,
38 createContext,
39 tools,
40} from '@apify/scraper-tools';
41
42import type { Input } from './consts.js';
43import { ProxyRotation } from './consts.js';
44
45const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
46
47const { META_KEY, DEVTOOLS_TIMEOUT_SECS, SESSION_MAX_USAGE_COUNTS } =
48 scraperToolsConstants;
49const SCHEMA = JSON.parse(
50 await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
51);
52
53/**
54 * Holds all the information necessary for constructing a crawler
55 * instance and creating a context for a pageFunction invocation.
56 */
57export class CrawlerSetup implements CrawlerSetupOptions {
58 name = 'Playwright Scraper';
59 rawInput: string;
60 env: ApifyEnv;
61 /**
62 * Used to store data that persist navigations
63 */
64 globalStore = new Map();
65 requestQueue: RequestQueueV2;
66 keyValueStore: KeyValueStore;
67 customData: unknown;
68 input: Input;
69 maxSessionUsageCount: number;
70 evaledPageFunction: (...args: unknown[]) => unknown;
71 evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
72 evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
73 devtools: boolean;
74 datasetName?: string;
75 keyValueStoreName?: string;
76 requestQueueName?: string;
77
78 crawler!: PlaywrightCrawler;
79 requestList!: RequestList;
80 dataset!: Dataset;
81 pagesOutputted!: number;
82 private initPromise: Promise<void>;
83
84 constructor(input: Input) {
85 // Set log level early to prevent missed messages.
86 if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);
87
88 // Keep this as string to be immutable.
89 this.rawInput = JSON.stringify(input);
90
91 // Attempt to load page function from disk if not present on input.
92 tools.maybeLoadPageFunctionFromDisk(
93 input,
94 dirname(fileURLToPath(import.meta.url)),
95 );
96
97 // Validate INPUT if not running on Apify Cloud Platform.
98 if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);
99
100 this.input = input;
101 this.env = Actor.getEnv();
102
103 // Validations
104 this.input.pseudoUrls.forEach((purl) => {
105 if (!tools.isPlainObject(purl)) {
106 throw new Error(
107 'The pseudoUrls Array must only contain Objects.',
108 );
109 }
110 if (purl.userData && !tools.isPlainObject(purl.userData)) {
111 throw new Error(
112 'The userData property of a pseudoUrl must be an Object.',
113 );
114 }
115 });
116
117 this.input.initialCookies.forEach((cookie) => {
118 if (!tools.isPlainObject(cookie)) {
119 throw new Error(
120 'The initialCookies Array must only contain Objects.',
121 );
122 }
123 });
124
125 if (
126 !/^(domcontentloaded|load|networkidle)$/.test(this.input.waitUntil)
127 ) {
128 throw new Error(
129 'Navigation wait until event must be valid. See tooltip.',
130 );
131 }
132
133 // solving proxy rotation settings
134 this.maxSessionUsageCount =
135 SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];
136
137 // Functions need to be evaluated.
138 this.evaledPageFunction = tools.evalFunctionOrThrow(
139 this.input.pageFunction,
140 );
141
142 if (this.input.preNavigationHooks) {
143 this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
144 this.input.preNavigationHooks,
145 'preNavigationHooks',
146 );
147 } else {
148 this.evaledPreNavigationHooks = [];
149 }
150
151 if (this.input.postNavigationHooks) {
152 this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
153 this.input.postNavigationHooks,
154 'postNavigationHooks',
155 );
156 } else {
157 this.evaledPostNavigationHooks = [];
158 }
159
160 // Enable devtools mode whenever the page function includes the 'debugger;' keyword.
161 this.devtools = this.input.pageFunction.includes('debugger;');
162
163 // Named storages
164 this.datasetName = this.input.datasetName;
165 this.keyValueStoreName = this.input.keyValueStoreName;
166 this.requestQueueName = this.input.requestQueueName;
167
168 // Initialize async operations.
169 this.crawler = null!;
170 this.requestList = null!;
171 this.requestQueue = null!;
172 this.dataset = null!;
173 this.keyValueStore = null!;
174 this.initPromise = this._initializeAsync();
175 }
176
177 private async _initializeAsync() {
178 // RequestList
179 const startUrls = this.input.startUrls.map((req) => {
180 req.useExtendedUniqueKey = true;
181 req.keepUrlFragment = this.input.keepUrlFragments;
182 return req;
183 });
184
185 this.requestList = await RequestList.open(
186 'PLAYWRIGHT_SCRAPER',
187 startUrls,
188 );
189
190 // RequestQueue
191 this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
192
193 // Dataset
194 this.dataset = await Dataset.open(this.datasetName);
195 const info = await this.dataset.getInfo();
196 this.pagesOutputted = info?.itemCount ?? 0;
197
198 // KeyValueStore
199 this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
200 }
201
202 /**
203 * Resolves to a `PlaywrightCrawler` instance.
204 */
205 async createCrawler() {
206 await this.initPromise;
207
208 const options: PlaywrightCrawlerOptions = {
209 requestHandler: this._requestHandler.bind(this),
210 requestList: this.requestList,
211 requestQueue: this.requestQueue,
212 requestHandlerTimeoutSecs: this.devtools
213 ? DEVTOOLS_TIMEOUT_SECS
214 : this.input.pageFunctionTimeoutSecs,
215 preNavigationHooks: [],
216 postNavigationHooks: [],
217 failedRequestHandler: this._failedRequestHandler.bind(this),
218 respectRobotsTxtFile: this.input.respectRobotsTxtFile,
219 maxConcurrency: this.input.maxConcurrency,
220 maxRequestRetries: this.input.maxRequestRetries,
221 maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
222 proxyConfiguration: (await Actor.createProxyConfiguration(
223 this.input.proxyConfiguration,
224 )) as any as ProxyConfiguration,
225 launchContext: {
226 launcher: firefox,
227 launchOptions: await launchOptions({
228 ...this.input,
229 humanize: this.input.humanize
230 ? Number(this.input.humanize)
231 : 0,
232 }),
233 } as PlaywrightLaunchContext,
234 useSessionPool: true,
235 persistCookiesPerSession: true,
236 sessionPoolOptions: {
237 persistStateKeyValueStoreId: this.input.sessionPoolName
238 ? SESSION_STORE_NAME
239 : undefined,
240 persistStateKey: this.input.sessionPoolName,
241 sessionOptions: {
242 maxUsageCount: this.maxSessionUsageCount,
243 },
244 },
245 experiments: {
246 requestLocking: true,
247 },
248 };
249
250 this._createNavigationHooks(options);
251
252 if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
253 options.sessionPoolOptions!.maxPoolSize = 1;
254 }
255
256 this.crawler = new PlaywrightCrawler(options);
257
258 return this.crawler;
259 }
260
261 private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
262 options.preNavigationHooks!.push(
263 async ({ request, page, session }, gotoOptions) => {
264 // Attach a console listener to get all logs from Browser context.
265 if (this.input.browserLog) browserTools.dumpConsole(page);
266
267 // Add initial cookies, if any.
268 if (
269 this.input.initialCookies &&
270 this.input.initialCookies.length
271 ) {
272 const cookiesToSet = session
273 ? tools.getMissingCookiesFromSession(
274 session,
275 this.input.initialCookies,
276 request.url,
277 )
278 : this.input.initialCookies;
279
280 if (cookiesToSet?.length) {
281 // setting initial cookies that are not already in the session and page
282 session?.setCookies(cookiesToSet, request.url);
283 await page.context().addCookies(cookiesToSet);
284 }
285 }
286
287 if (gotoOptions) {
288 gotoOptions.timeout =
289 (this.devtools
290 ? DEVTOOLS_TIMEOUT_SECS
291 : this.input.pageLoadTimeoutSecs) * 1000;
292 gotoOptions.waitUntil = this.input.waitUntil;
293 }
294 },
295 );
296
297 options.preNavigationHooks!.push(
298 ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
299 );
300 options.postNavigationHooks!.push(
301 ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
302 );
303 }
304
305 private _runHookWithEnhancedContext(
306 hooks: ((...args: unknown[]) => Awaitable<void>)[],
307 ) {
308 return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
309 const { customData } = this.input;
310 return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
311 });
312 }
313
314 private async _failedRequestHandler({
315 request,
316 }: PlaywrightCrawlingContext) {
317 const lastError =
318 request.errorMessages[request.errorMessages.length - 1];
319 const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
320 log.error(
321 `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
322 );
323 return this._handleResult(request, undefined, undefined, true);
324 }
325
326 /**
327 * First of all, it initializes the state that is exposed to the user via
328 * `pageFunction` context.
329 *
330 * Then it invokes the user provided `pageFunction` with the prescribed context
331 * and saves its return value.
332 *
333 * Finally, it makes decisions based on the current state and post-processes
334 * the data returned from the `pageFunction`.
335 */
336 private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
337 const { request, response, crawler } = crawlingContext;
338
339 /**
340 * PRE-PROCESSING
341 */
342 // Make sure that an object containing internal metadata
343 // is present on every request.
344 tools.ensureMetaData(request);
345
346 // Abort the crawler if the maximum number of results was reached.
347 const aborted = await this._handleMaxResultsPerCrawl(
348 crawler.autoscaledPool,
349 );
350 if (aborted) return;
351
352 const pageFunctionArguments: Dictionary = {};
353
354 // We must use properties and descriptors not to trigger getters / setters.
355 Object.defineProperties(
356 pageFunctionArguments,
357 Object.getOwnPropertyDescriptors(crawlingContext),
358 );
359
360 pageFunctionArguments.response = {
361 status: response && response.status(),
362 headers: response && response.headers(),
363 };
364
365 // Setup and create Context.
366 const contextOptions = {
367 crawlerSetup: {
368 rawInput: this.rawInput,
369 env: this.env,
370 globalStore: this.globalStore,
371 requestQueue: this.requestQueue,
372 keyValueStore: this.keyValueStore,
373 customData: this.input.customData,
374 },
375 pageFunctionArguments,
376 };
377 const { context, state } = createContext(contextOptions);
378
379 if (this.input.closeCookieModals) {
380 await sleep(500);
381 await crawlingContext.page.evaluate(getInjectableScript());
382 await sleep(2000);
383 }
384
385 if (this.input.maxScrollHeightPixels > 0) {
386 await crawlingContext.infiniteScroll({
387 maxScrollHeight: this.input.maxScrollHeightPixels,
388 });
389 }
390
391 /**
392 * USER FUNCTION INVOCATION
393 */
394 const pageFunctionResult = await this.evaledPageFunction(context);
395
396 /**
397 * POST-PROCESSING
398 */
399 // Enqueue more links if Pseudo URLs and a link selector are available,
400 // unless the user invoked the `skipLinks()` context function
401 // or maxCrawlingDepth would be exceeded.
402 if (!state.skipLinks) await this._handleLinks(crawlingContext);
403
404 // Save the `pageFunction`s result to the default dataset.
405 await this._handleResult(
406 request,
407 response,
408 pageFunctionResult as Dictionary,
409 );
410 }
411
412 private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
413 if (
414 !this.input.maxResultsPerCrawl ||
415 this.pagesOutputted < this.input.maxResultsPerCrawl
416 )
417 return false;
418 if (!autoscaledPool) return false;
419 log.info(
420 `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
421 );
422 await autoscaledPool.abort();
423 return true;
424 }
425
426 private async _handleLinks({
427 request,
428 enqueueLinks,
429 }: PlaywrightCrawlingContext) {
430 if (!this.requestQueue) return;
431 const currentDepth = (request.userData![META_KEY] as RequestMetadata)
432 .depth;
433 const hasReachedMaxDepth =
434 this.input.maxCrawlingDepth &&
435 currentDepth >= this.input.maxCrawlingDepth;
436 if (hasReachedMaxDepth) {
437 log.debug(
438 `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
439 );
440 return;
441 }
442
443 const enqueueOptions: EnqueueLinksOptions = {
444 globs: this.input.globs,
445 pseudoUrls: this.input.pseudoUrls,
446 exclude: this.input.excludes,
447 transformRequestFunction: (requestOptions) => {
448 requestOptions.userData ??= {};
449 requestOptions.userData[META_KEY] = {
450 parentRequestId: request.id || request.uniqueKey,
451 depth: currentDepth + 1,
452 };
453
454 requestOptions.useExtendedUniqueKey = true;
455 requestOptions.keepUrlFragment = this.input.keepUrlFragments;
456 return requestOptions;
457 },
458 };
459
460 if (this.input.linkSelector) {
461 await enqueueLinks({
462 ...enqueueOptions,
463 selector: this.input.linkSelector,
464 });
465 }
466 }
467
468 private async _handleResult(
469 request: Request,
470 response?: Response,
471 pageFunctionResult?: Dictionary,
472 isError?: boolean,
473 ) {
474 const payload = tools.createDatasetPayload(
475 request,
476 response,
477 pageFunctionResult,
478 isError,
479 );
480 await this.dataset.pushData(payload);
481 this.pagesOutputted++;
482 }
483}
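
Putting the pieces together: _requestHandler builds the context passed to the user's pageFunction, including the Playwright page, the request, a trimmed response object (status and headers), log, customData and helpers such as skipLinks(). Below is a hedged example of a page function that uses only these members:

// Hedged example of a pageFunction a user could paste into the input.
// It relies only on context members prepared above: page, request, response,
// log, customData and skipLinks().
async function pageFunction(context) {
    const { page, request, response, log, customData, skipLinks } = context;

    // Do not enqueue links from error pages; still record the failure.
    if (response && response.status >= 400) {
        await skipLinks();
        return { url: request.url, httpStatus: response.status };
    }

    const title = await page.title();
    log.info(`URL: ${request.url} TITLE: ${title}`);

    // customData comes from the "Custom data" input field.
    return { url: request.url, title, label: customData?.label };
}
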
Pricing
Pricing model: Pay per usage
This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.