Camoufox Scraper


Developed by Apify · Maintained by Community

Crawls websites with the stealthy Camoufox browser and the Playwright library, using provided server-side Node.js code. Supports both recursive crawling and lists of URLs, and can log in to websites.

Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 0
Runs succeeded: 21%
Last modified: 4 days ago
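
The scraping logic is supplied as a page function in the Actor input. As a minimal sketch (this mirrors the prefill in INPUT_SCHEMA.json below), the page function receives a context with the Playwright page, the request, and a log object, and whatever it returns is pushed to the dataset:

async function pageFunction(context) {
    const { page, request, log } = context;
    const title = await page.title();
    log.info(`URL: ${request.url} TITLE: ${title}`);
    return {
        url: request.url,
        title,
    };
}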

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
dist
!./storage/key_value_stores/default/INPUT.json

# installed files
node_modules

Dockerfile

FROM apify/actor-node-playwright-firefox:22 AS builder

COPY --chown=myuser package*.json ./

RUN npm install --include=dev --audit=false

COPY --chown=myuser . ./

RUN npm run build

FROM apify/actor-node-playwright-firefox:22

COPY --from=builder --chown=myuser /home/myuser/dist ./dist

COPY --chown=myuser package*.json ./

RUN npm --quiet set progress=false \
    && npm install --omit=dev \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

RUN npm run get-binaries

COPY --chown=myuser . ./

RUN rm -rf ./pw-browsers/

ENV APIFY_DISABLE_OUTDATED_WARNING=1

CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

INPUT_SCHEMA.json

{
    "$schema": "https://raw.githubusercontent.com/apify/apify-shared-js/refs/heads/master/packages/input_schema/src/schema.json",
    "title": "Camoufox Scraper Input",
    "description": "Use the following form to configure Camoufox Scraper. The Start URLs and Page function are required and all other fields are optional. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the Actor's README in the ACTOR INFO tab.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "sectionCaption": "Basic configuration",
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [{ "url": "https://crawlee.dev" }],
            "editor": "requestListSources"
        },
        "globs": {
            "title": "Glob Patterns",
            "type": "array",
            "description": "Glob patterns to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Glob patterns will cause the scraper to enqueue all links matched by the Link selector.",
            "editor": "globs",
            "default": [],
            "prefill": [
                {
                    "glob": "https://crawlee.dev/*/*"
                }
            ]
        },
        "pseudoUrls": {
            "title": "Pseudo-URLs",
            "type": "array",
            "description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.",
            "editor": "pseudoUrls",
            "default": [],
            "prefill": []
        },
        "excludes": {
            "title": "Exclude Glob Patterns",
            "type": "array",
            "description": "Glob patterns to match links in the page that you want to exclude from being enqueued.",
            "editor": "globs",
            "default": [],
            "prefill": [
                {
                    "glob": "/**/*.{png,jpg,jpeg,pdf}"
                }
            ]
        },
        "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue URLs from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.",
            "editor": "textfield",
            "prefill": "a"
        },
        "keepUrlFragments": {
            "title": "Keep URL fragments",
            "type": "boolean",
            "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
            "default": false
        },
        "respectRobotsTxtFile": {
            "title": "Respect the robots.txt file",
            "type": "boolean",
            "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
            "default": false,
            "prefill": true
        },
        "pageFunction": {
            "title": "Page function",
            "type": "string",
            "description": "Function executed for each request",
            "prefill": "async function pageFunction(context) {\n    const { page, request, log } = context;\n    const title = await page.title();\n    log.info(`URL: ${request.url} TITLE: ${title}`);\n    return {\n        url: request.url,\n        title\n    };\n}",
            "editor": "javascript"
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/apify/camoufox-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true },
            "editor": "proxy"
        },
        "proxyRotation": {
            "title": "Proxy rotation",
            "type": "string",
            "description": "This property indicates the strategy of proxy rotation and can only be used in conjunction with Apify Proxy. The recommended setting automatically picks the best proxies from your available pool and rotates them evenly, discarding proxies that become blocked or unresponsive. If this strategy does not work for you for any reason, you may configure the scraper to either use a new proxy for each request, or to use one proxy as long as possible, until the proxy fails. IMPORTANT: This setting will only use your available Apify Proxy pool, so if you don't have enough proxies for a given task, no rotation setting will produce satisfactory results.",
            "default": "RECOMMENDED",
            "editor": "select",
            "enum": ["RECOMMENDED", "PER_REQUEST", "UNTIL_FAILURE"],
            "enumTitles": [
                "Use recommended settings",
                "Rotate proxy after each request",
                "Use one proxy until failure"
            ]
        },
        "sessionPoolName": {
            "title": "Session pool name",
            "type": "string",
            "description": "<b>Use only English alphanumeric characters, dashes and underscores.</b> A session is a representation of a user. It has its own IP and cookies which are then used together to emulate a real user. Usage of the sessions is controlled by the Proxy rotation option. By providing a session pool name, you enable sharing of those sessions across multiple Actor runs. This is very useful when you need specific cookies for accessing the websites or when a lot of your proxies are already blocked. Instead of trying randomly, a list of working sessions will be saved and a new Actor run can reuse those sessions. Note that the IP lock on sessions expires after 24 hours, unless the session is used again in that window.",
            "editor": "textfield",
            "minLength": 3,
            "maxLength": 200,
            "pattern": "[0-9A-z-]"
        },
        "initialCookies": {
            "title": "Initial cookies",
            "type": "array",
            "description": "The provided cookies will be pre-set to all pages the scraper opens.",
            "default": [],
            "prefill": [],
            "editor": "json"
        },
        "headless": {
            "title": "Run browsers in headless mode",
            "type": "boolean",
            "description": "By default, browsers run in headless mode. You can toggle this off to run them in headful mode, which can help with certain rare anti-scraping protections but is slower and more costly.",
            "default": true
        },
        "ignoreSslErrors": {
            "title": "Ignore SSL errors",
            "type": "boolean",
            "description": "Scraper will ignore SSL certificate errors.",
            "default": false,
            "groupCaption": "Security",
            "groupDescription": "Various options that help you disable browser security features such as HTTPS certificates and CORS."
        },
        "downloadMedia": {
            "sectionCaption": "Performance and limits",
            "title": "Download media",
            "type": "boolean",
            "description": "Scraper will download media such as images, fonts, videos and sounds. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
            "default": true,
            "groupCaption": "Resources",
            "groupDescription": "Settings that help to disable downloading web page resources."
        },
        "maxRequestRetries": {
            "title": "Max request retries",
            "type": "integer",
            "description": "Maximum number of times the request for the page will be retried in case of an error. Setting it to 0 means that the request will be attempted once and will not be retried if it fails.",
            "minimum": 0,
            "default": 3,
            "unit": "retries"
        },
        "maxPagesPerCrawl": {
            "title": "Max pages per run",
            "type": "integer",
            "description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
            "minimum": 0,
            "default": 0,
            "unit": "pages"
        },
        "maxResultsPerCrawl": {
            "title": "Max result records",
            "type": "integer",
            "description": "Maximum number of results that will be saved to dataset. The scraper will terminate afterwards. 0 means unlimited.",
            "minimum": 0,
            "default": 0,
            "unit": "results"
        },
        "maxCrawlingDepth": {
            "title": "Max crawling depth",
            "type": "integer",
            "description": "Defines how many links away from the Start URLs the scraper will descend. 0 means unlimited.",
            "minimum": 0,
            "default": 0
        },
        "maxConcurrency": {
            "title": "Max concurrency",
            "type": "integer",
            "description": "Defines how many pages can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. Use this option to set a hard limit.",
            "minimum": 1,
            "default": 50
        },
        "pageLoadTimeoutSecs": {
            "title": "Page load timeout",
            "type": "integer",
            "description": "Maximum time the scraper will allow a web page to load in seconds.",
            "minimum": 1,
            "default": 60,
            "unit": "secs"
        },
        "pageFunctionTimeoutSecs": {
            "title": "Page function timeout",
            "type": "integer",
            "description": "Maximum time the scraper will wait for the page function to execute in seconds.",
            "minimum": 1,
            "default": 60,
            "unit": "secs"
        },
        "waitUntil": {
            "title": "Navigation wait until",
            "description": "The scraper will wait until the selected events are triggered in the page before executing the page function. Available events are <code>domcontentloaded</code>, <code>load</code> and <code>networkidle</code>. <a href=\"https://playwright.dev/docs/api/class-page#page-goto-option-wait-until\" target=\"_blank\">See Playwright docs</a>.",
            "type": "string",
            "editor": "select",
            "enum": ["networkidle", "load", "domcontentloaded"],
            "prefill": "networkidle",
            "default": "networkidle"
        },
        "preNavigationHooks": {
            "sectionCaption": "Advanced configuration",
            "title": "Pre-navigation hooks",
            "type": "string",
            "description": "Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`, which are passed to the `page.goto()` function the crawler calls to navigate.",
            "prefill": "// We need to return an array of (possibly async) functions here.\n// The functions accept two arguments: the \"crawlingContext\" object\n// and \"gotoOptions\".\n[\n    async (crawlingContext, gotoOptions) => {\n        const { page } = crawlingContext;\n        // ...\n    },\n]",
            "editor": "javascript"
        },
        "postNavigationHooks": {
            "title": "Post-navigation hooks",
            "type": "string",
            "description": "Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful. The function accepts `crawlingContext` as the only parameter.",
            "prefill": "// We need to return an array of (possibly async) functions here.\n// The functions accept a single argument: the \"crawlingContext\" object.\n[\n    async (crawlingContext) => {\n        const { page } = crawlingContext;\n        // ...\n    },\n]",
            "editor": "javascript"
        },
        "closeCookieModals": {
            "title": "Dismiss cookie modals",
            "type": "boolean",
            "description": "Uses the [I don't care about cookies](https://addons.mozilla.org/en-US/firefox/addon/i-dont-care-about-cookies/) browser extension. When on, the crawler will automatically try to dismiss cookie consent modals. This can be useful when crawling European websites that show cookie consent modals.",
            "default": false
        },
        "maxScrollHeightPixels": {
            "title": "Maximum scrolling distance in pixels",
            "type": "integer",
            "description": "The crawler will scroll down the page until all content is loaded or the maximum scrolling distance is reached. Setting this to `0` disables scrolling altogether.",
            "default": 5000
        },
        "debugLog": {
            "title": "Debug log",
            "type": "boolean",
            "description": "Debug messages will be included in the log. Use <code>context.log.debug('message')</code> to log your own debug messages.",
            "default": false,
            "groupCaption": "Enable logs",
            "groupDescription": "Logs settings"
        },
        "browserLog": {
            "title": "Browser log",
            "type": "boolean",
            "description": "Console messages from the Browser will be included in the log. This may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.",
            "default": false
        },
        "customData": {
            "title": "Custom data",
            "type": "object",
            "description": "This object will be available on pageFunction's context as customData.",
            "default": {},
            "prefill": {},
            "editor": "json"
        },
        "datasetName": {
            "title": "Dataset name",
            "type": "string",
            "description": "Name or ID of the dataset that will be used for storing results. If left empty, the default dataset of the run will be used.",
            "editor": "textfield"
        },
        "keyValueStoreName": {
            "title": "Key-value store name",
            "type": "string",
            "description": "Name or ID of the key-value store that will be used for storing records. If left empty, the default key-value store of the run will be used.",
            "editor": "textfield"
        },
        "requestQueueName": {
            "title": "Request queue name",
            "type": "string",
            "description": "Name of the request queue that will be used for storing requests. If left empty, the default request queue of the run will be used.",
            "editor": "textfield"
        },
        "os": {
            "sectionCaption": "Camoufox options",
            "title": "Emulated operating system",
            "type": "array",
            "description": "Operating system to use for the fingerprint generation. Can be 'windows', 'macos', 'linux', or a list to randomly choose from.",
            "editor": "select",
            "items": {
                "type": "string",
                "enum": ["linux", "windows", "macos"],
                "enumTitles": ["Linux", "Windows", "MacOS"]
            }
        },
        "block_images": {
            "title": "Block image loading",
            "type": "boolean",
            "description": "Blocks the image loading on the page. Saves bandwidth and speeds up the scraping, but might cause detection of the scraper.",
            "default": false
        },
        "block_webrtc": {
            "title": "Block WebRTC",
            "type": "boolean",
            "description": "Blocks WebRTC capabilities in Camoufox. Note that this might break some web applications.",
            "default": false
        },
        "block_webgl": {
            "title": "Block WebGL",
            "type": "boolean",
            "description": "Blocks WebGL capabilities in Camoufox. Note that this might break some websites."
        },
        "disable_coop": {
            "title": "Disable Cross-Origin-Opener-Policy",
            "type": "boolean",
            "description": "Disables the Cross-Origin-Opener-Policy."
        },
        "geoip": {
            "title": "GeoIP (⚠️ might not fully work with Apify Proxy ⚠️)",
            "description": "Calculate geo-data based on IP address. Might not fully work with Apify Proxy.",
            "type": "boolean",
            "default": false
        },
        "humanize": {
            "title": "Humanize cursor movement (speed in seconds)",
            "type": "string",
            "editor": "textfield",
            "description": "Humanize cursor movement."
        },
        "locale": {
            "title": "Locales",
            "type": "array",
            "editor": "stringList",
            "description": "Locales to use in the browser / OS."
        },
        "fonts": {
            "title": "Fonts",
            "type": "array",
            "editor": "stringList",
            "description": "Fonts to load into the browser / OS."
        },
        "custom_fonts_only": {
            "title": "Custom fonts only",
            "type": "boolean",
            "description": "If enabled, only custom fonts (from the option `fonts`) are used. This will disable the default OS fonts."
        },
        "enable_cache": {
            "title": "Enable cache",
            "type": "boolean",
            "description": "Cache previous pages and requests in Camoufox."
        },
        "debug": {
            "title": "Camoufox debug",
            "type": "boolean",
            "description": "Prints the config being sent to Camoufox."
        }
    },
    "required": ["startUrls", "pageFunction", "proxyConfiguration"]
}
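
For reference, a minimal run input that satisfies the schema above could look like the following; the URLs, glob, limits, and Camoufox options are purely illustrative:

{
    "startUrls": [{ "url": "https://crawlee.dev" }],
    "globs": [{ "glob": "https://crawlee.dev/*/*" }],
    "linkSelector": "a",
    "pageFunction": "async function pageFunction(context) {\n    const { page, request, log } = context;\n    const title = await page.title();\n    log.info(`URL: ${request.url} TITLE: ${title}`);\n    return { url: request.url, title };\n}",
    "proxyConfiguration": { "useApifyProxy": true },
    "maxPagesPerCrawl": 100,
    "os": ["linux"],
    "block_images": true
}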

package.json

{
    "name": "actor-camoufox-scraper",
    "version": "3.1.0",
    "private": true,
    "description": "Crawl web pages using Apify, headless browser and Playwright with Camoufox",
    "type": "module",
    "dependencies": {
        "@apify/scraper-tools": "^1.1.4",
        "@crawlee/core": "^3.13.2",
        "@crawlee/playwright": "^3.13.2",
        "@crawlee/utils": "^3.13.2",
        "apify": "^3.2.6",
        "camoufox-js": "^0.3.0",
        "idcac-playwright": "^0.1.3",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/tsconfig": "^0.1.0",
        "@types/node": "^22.7.4",
        "tsx": "^4.19.1",
        "typescript": "~5.8.0"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "tsc",
        "get-binaries": "camoufox-js fetch"
    },
    "repository": {
        "type": "git",
        "url": "https://github.com/apify/apify-sdk-js"
    },
    "author": {
        "name": "Apify Technologies",
        "email": "support@apify.com",
        "url": "https://apify.com"
    },
    "license": "Apache-2.0",
    "homepage": "https://github.com/apify/apify-sdk-js"
}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "outDir": "dist",
        "module": "ESNext",
        "allowJs": true,
        "skipLibCheck": true
    },
    "include": ["src"]
}

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "camoufox-scraper",
    "version": "0.1",
    "buildTag": "latest"
}

src/main.ts

import { runActor } from '@apify/scraper-tools';

import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);

src/internals/consts.ts

import type {
    GlobInput,
    ProxyConfigurationOptions,
    PseudoUrlInput,
    RegExpInput,
    RequestOptions,
    Session,
} from '@crawlee/core';
import type { Dictionary } from '@crawlee/utils';

export const enum ProxyRotation {
    Recommended = 'RECOMMENDED',
    PerRequest = 'PER_REQUEST',
    UntilFailure = 'UNTIL_FAILURE',
}

/**
 * Replicates the INPUT_SCHEMA with TypeScript types for quick reference
 * and IDE type check integration.
 */
export interface Input {
    startUrls: RequestOptions[];
    globs: GlobInput[];
    regexps: RegExpInput[];
    pseudoUrls: PseudoUrlInput[];
    excludes: GlobInput[];
    linkSelector?: string;
    keepUrlFragments: boolean;
    respectRobotsTxtFile: boolean;
    pageFunction: string;
    preNavigationHooks?: string;
    postNavigationHooks?: string;
    proxyConfiguration: ProxyConfigurationOptions;
    proxyRotation: ProxyRotation;
    sessionPoolName?: string;
    initialCookies: Parameters<Session['setCookies']>[0];
    maxScrollHeightPixels: number;
    ignoreSslErrors: boolean;
    ignoreCorsAndCsp: boolean;
    closeCookieModals: boolean;
    downloadMedia: boolean;
    maxRequestRetries: number;
    maxPagesPerCrawl: number;
    maxResultsPerCrawl: number;
    maxCrawlingDepth: number;
    maxConcurrency: number;
    pageLoadTimeoutSecs: number;
    pageFunctionTimeoutSecs: number;
    waitUntil: 'networkidle' | 'load' | 'domcontentloaded';
    debugLog: boolean;
    browserLog: boolean;
    customData: Dictionary;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;
    headless: boolean;
    os: ('linux' | 'macos' | 'windows')[];
    blockImages: boolean;
    blockWebrtc: boolean;
    blockWebgl: boolean;
    disableCoop: boolean;
    geoip: boolean;
    humanize: string;
    locale: string[];
    fonts: string[];
    customFontsOnly: boolean;
    enableCache: boolean;
    debug: boolean;
}

src/internals/crawler_setup.ts

import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    EnqueueLinksOptions,
    PlaywrightCrawlerOptions,
    PlaywrightCrawlingContext,
    PlaywrightLaunchContext,
    ProxyConfiguration,
    Request,
} from '@crawlee/playwright';
import {
    Dataset,
    KeyValueStore,
    log,
    PlaywrightCrawler,
    RequestList,
    RequestQueueV2,
} from '@crawlee/playwright';
import type { Awaitable, Dictionary } from '@crawlee/utils';
import { sleep } from '@crawlee/utils';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { launchOptions } from 'camoufox-js';
import { getInjectableScript } from 'idcac-playwright';
import type { Response } from 'playwright';
import { firefox } from 'playwright';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    browserTools,
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';

const { META_KEY, DEVTOOLS_TIMEOUT_SECS, SESSION_MAX_USAGE_COUNTS } =
    scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);

/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Playwright Scraper';
    rawInput: string;
    env: ApifyEnv;
    /**
     * Used to store data that persists across navigations
     */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PlaywrightCrawler;
    requestList!: RequestList;
    dataset!: Dataset;
    pagesOutputted!: number;
    private initPromise: Promise<void>;

    constructor(input: Input) {
        // Set log level early to prevent missed messages.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep this as string to be immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load page function from disk if not present on input.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Validate INPUT if not running on Apify Cloud Platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        if (
            !/^(domcontentloaded|load|networkidle)$/.test(this.input.waitUntil)
        ) {
            throw new Error(
                'Navigation wait until event must be valid. See tooltip.',
            );
        }

        // solving proxy rotation settings
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Functions need to be evaluated.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

        // Start the browser with the debugger any time the page function includes the keyword.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Named storages
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Initialize async operations.
        this.crawler = null!;
        this.requestList = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

    private async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.requestList = await RequestList.open(
            'PLAYWRIGHT_SCRAPER',
            startUrls,
        );

        // RequestQueue
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        // Dataset
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        // KeyValueStore
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
    }

    /**
     * Resolves to a `PlaywrightCrawler` instance.
     */
    async createCrawler() {
        await this.initPromise;

        const options: PlaywrightCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                launcher: firefox,
                launchOptions: await launchOptions({
                    ...this.input,
                    humanize: this.input.humanize
                        ? Number(this.input.humanize)
                        : 0,
                }),
            } as PlaywrightLaunchContext,
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PlaywrightCrawler(options);

        return this.crawler;
    }

    private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session }, gotoOptions) => {
                // Attach a console listener to get all logs from Browser context.
                if (this.input.browserLog) browserTools.dumpConsole(page);

                // Add initial cookies, if any.
                if (
                    this.input.initialCookies &&
                    this.input.initialCookies.length
                ) {
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
                        // setting initial cookies that are not already in the session and page
                        session?.setCookies(cookiesToSet, request.url);
                        await page.context().addCookies(cookiesToSet);
                    }
                }

                if (gotoOptions) {
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    private async _failedRequestHandler({
        request,
    }: PlaywrightCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * First of all, it initializes the state that is exposed to the user via
     * `pageFunction` context.
     *
     * Then it invokes the user provided `pageFunction` with the prescribed context
     * and saves its return value.
     *
     * Finally, it makes decisions based on the current state and post-processes
     * the data returned from the `pageFunction`.
     */
    private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

        /**
         * PRE-PROCESSING
         */
        // Make sure that an object containing internal metadata
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

        // We must use properties and descriptors not to trigger getters / setters.
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

        // Setup and create Context.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        if (this.input.closeCookieModals) {
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

        /**
         * USER FUNCTION INVOCATION
         */
        const pageFunctionResult = await this.evaledPageFunction(context);

        /**
         * POST-PROCESSING
         */
        // Enqueue more links if Pseudo URLs and a link selector are available,
        // unless the user invoked the `skipLinks()` context function
        // or maxCrawlingDepth would be exceeded.
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

        // Save the `pageFunction`'s result to the default dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    private async _handleLinks({
        request,
        enqueueLinks,
    }: PlaywrightCrawlingContext) {
        if (!this.requestQueue) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        const enqueueOptions: EnqueueLinksOptions = {
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        if (this.input.linkSelector) {
            await enqueueLinks({
                ...enqueueOptions,
                selector: this.input.linkSelector,
            });
        }
    }

    private async _handleResult(
        request: Request,
        response?: Response,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}
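
To make the request handler above concrete, here is a hedged sketch of a page function that uses two of the context features it wires up, customData and skipLinks(); the field name and the skipping rule are illustrative, not part of this Actor:

async function pageFunction(context) {
    const { page, request, log, customData, skipLinks } = context;

    // customData comes straight from the "Custom data" input field.
    // "stopTitleKeyword" is an illustrative field name, not a built-in option.
    const { stopTitleKeyword } = customData;

    const title = await page.title();

    // If the page title contains the configured keyword, do not enqueue
    // any further links from this page (see _handleLinks above).
    if (stopTitleKeyword && title.includes(stopTitleKeyword)) {
        await skipLinks();
        log.info(`Not enqueueing links from ${request.url}`);
    }

    return { url: request.url, title };
}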

Pricing

Pricing model: Pay per usage

This Actor is free to use; you only pay for the Apify platform usage it consumes.