1import { readFile } from 'node:fs/promises';
2import { dirname } from 'node:path';
3import { fileURLToPath, URL } from 'node:url';
4
5import type {
6    AutoscaledPool,
7    EnqueueLinksOptions,
8    PlaywrightCrawlerOptions,
9    PlaywrightCrawlingContext,
10    PlaywrightLaunchContext,
11    ProxyConfiguration,
12    Request,
13} from '@crawlee/playwright';
14import {
15    Dataset,
16    KeyValueStore,
17    log,
18    PlaywrightCrawler,
19    RequestList,
20    RequestQueueV2,
21} from '@crawlee/playwright';
22import type { Awaitable, Dictionary } from '@crawlee/utils';
23import { sleep } from '@crawlee/utils';
24import type { ApifyEnv } from 'apify';
25import { Actor } from 'apify';
26import { getInjectableScript } from 'idcac-playwright';
27import type { Response } from 'playwright';
28import playwright from 'playwright';
29
30import type {
31    CrawlerSetupOptions,
32    RequestMetadata,
33} from '@apify/scraper-tools';
34import {
35    browserTools,
36    constants as scraperToolsConstants,
37    createContext,
38    tools,
39} from '@apify/scraper-tools';
40
41import type { Input } from './consts.js';
42import { ProxyRotation } from './consts.js';
43
/** Identifier passed as the session pool's persist-state key-value store when `sessionPoolName` is set. */
const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';

/** Key-value store record whose existence means the start URLs were already added to the request queue. */
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

// Shared constants re-exported by `@apify/scraper-tools`, pulled out one by one.
const META_KEY = scraperToolsConstants.META_KEY;
const DEFAULT_VIEWPORT = scraperToolsConstants.DEFAULT_VIEWPORT;
const DEVTOOLS_TIMEOUT_SECS = scraperToolsConstants.DEVTOOLS_TIMEOUT_SECS;
const SESSION_MAX_USAGE_COUNTS = scraperToolsConstants.SESSION_MAX_USAGE_COUNTS;

// The input schema is read relative to this module (two directories up) at import time.
const INPUT_SCHEMA_URL = new URL('../../INPUT_SCHEMA.json', import.meta.url);
const SCHEMA = JSON.parse(await readFile(INPUT_SCHEMA_URL, 'utf8'));
56
57
58
59
60
/**
 * Holds everything needed to build a `PlaywrightCrawler` from the Actor input
 * and to create the context object handed to the user's page function.
 *
 * The constructor validates the input synchronously and starts the asynchronous
 * storage initialization; `createCrawler()` awaits that initialization before
 * assembling the crawler.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Playwright Scraper';
    /** The input serialized with `JSON.stringify` at construction time. */
    rawInput: string;
    env: ApifyEnv;
    /** In-memory map exposed to the page function context through `crawlerSetup`. */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    /** Per-session usage limit, looked up from the selected proxy rotation mode. */
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    /** URL patterns handed to `blockRequests` before every navigation. */
    blockedUrlPatterns: string[] = [];
    /** True when the page function source contains the `debugger;` statement. */
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PlaywrightCrawler;
    dataset!: Dataset;
    /** Number of items pushed so far, seeded from the dataset's existing item count. */
    pagesOutputted!: number;
    /** Settles once storages are opened and start URLs are handed to the queue. */
    private readonly initPromise: Promise<void>;

    /**
     * Validates the input, evaluates the user-supplied functions and starts
     * the asynchronous initialization of storages.
     *
     * @param input The Actor input.
     * @throws Error when `pseudoUrls`, `initialCookies`, `useChrome`/`launcher`
     *   or `waitUntil` hold invalid values; helpers from `@apify/scraper-tools`
     *   with an `OrThrow` suffix may throw as well.
     */
    constructor(input: Input) {
        // Set the log level first so that no debug messages are missed.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Snapshot the input as a string before its objects get mutated
        // (e.g. the start URLs in `_initializeAsync`).
        this.rawInput = JSON.stringify(input);

        // NOTE(review): presumably fills in `input.pageFunction` from a file
        // next to this module when it is missing — confirm in scraper-tools.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Schema validation only runs outside of the Apify platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations. The optional arrays are guarded so that an input that
        // omits them does not fail with an opaque TypeError.
        this.input.pseudoUrls?.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        if (this.input.useChrome && this.input.launcher !== 'chromium') {
            throw new Error(
                'useChrome option could only be used with Chromium browser selected.',
            );
        }

        this.input.initialCookies?.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        if (
            !/^(domcontentloaded|load|networkidle)$/.test(this.input.waitUntil)
        ) {
            throw new Error(
                'Navigation wait until event must be valid. See tooltip.',
            );
        }

        // Proxy rotation mode decides how many times a session may be reused.
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // User-provided functions arrive as strings and must be evaluated.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        this.evaledPreNavigationHooks = this.input.preNavigationHooks
            ? tools.evalFunctionArrayOrThrow(
                  this.input.preNavigationHooks,
                  'preNavigationHooks',
              )
            : [];

        this.evaledPostNavigationHooks = this.input.postNavigationHooks
            ? tools.evalFunctionArrayOrThrow(
                  this.input.postNavigationHooks,
                  'postNavigationHooks',
              )
            : [];

        // Resources that are blocked unless the user opted into downloading them.
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [
                '.jpg',
                '.jpeg',
                '.png',
                '.svg',
                '.gif',
                '.webp',
                '.webm',
                '.ico',
                '.woff',
                '.eot',
            ];
        }

        if (!this.input.downloadCss) {
            this.blockedUrlPatterns.push('.css');
        }

        // DevTools mode is switched on whenever the page function contains `debugger;`.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Named storages.
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Placeholders; the real instances are assigned by `_initializeAsync`
        // and `createCrawler`.
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

    /**
     * Opens the key-value store, request queue and dataset, and enqueues the
     * start URLs unless the init flag record already exists.
     */
    private async _initializeAsync(): Promise<void> {
        // Start URLs are tagged in place before being turned into requests.
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        const alreadyInitialized = await this.keyValueStore.recordExists(
            REQUEST_QUEUE_INIT_FLAG_KEY,
        );

        if (!alreadyInitialized) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(
                null,
                startUrls,
            )) {
                // With a result limit set, stop collecting start requests at
                // 1.5x that limit.
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);

            // Deliberately not awaited, so initialization does not wait for
            // the remaining batches. The rejection handler keeps a failed
            // batch or a failed flag write from surfacing as an unhandled
            // promise rejection.
            void waitForAllRequestsToBeAdded
                .then(async () =>
                    this.keyValueStore.setValue(
                        REQUEST_QUEUE_INIT_FLAG_KEY,
                        '1',
                    ),
                )
                .catch((error: unknown) => {
                    const reason =
                        error instanceof Error ? error.message : String(error);
                    log.error(
                        `Failed to finish adding start URLs to the request queue or to persist the ${REQUEST_QUEUE_INIT_FLAG_KEY} flag: ${reason}`,
                    );
                });
        }

        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        // Seed the counter with the number of items already in the dataset.
        this.pagesOutputted = info?.itemCount ?? 0;
    }

    /**
     * Waits for the asynchronous initialization and builds the crawler.
     *
     * @returns The `PlaywrightCrawler` instance, also stored on `this.crawler`.
     */
    async createCrawler(): Promise<PlaywrightCrawler> {
        await this.initPromise;

        const args: string[] = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

        const options: PlaywrightCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestQueue: this.requestQueue,
            // DevTools mode uses its own handler timeout instead of the input value.
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            // Filled in by `_createNavigationHooks` below.
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            // An input value of 0 leaves the limit unset.
            maxRequestsPerCrawl:
                this.input.maxPagesPerCrawl === 0
                    ? undefined
                    : this.input.maxPagesPerCrawl,
            // NOTE(review): the double cast presumably bridges the `apify` and
            // `@crawlee/playwright` ProxyConfiguration typings — confirm.
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                useChrome: this.input.useChrome,
                launcher: playwright[this.input.launcher],
                launchOptions: {
                    ignoreHTTPSErrors: this.input.ignoreSslErrors,
                    bypassCSP: this.input.ignoreCorsAndCsp,
                    defaultViewport: DEFAULT_VIEWPORT,
                    devtools: this.devtools,
                    args,
                    headless: this.input.headless,
                },
            } as PlaywrightLaunchContext,
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                // Session state is persisted only when a pool name was provided.
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // "Until failure" rotation limits the pool to a single session.
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PlaywrightCrawler(options);

        return this.crawler;
    }

    /**
     * Appends the built-in pre-navigation hook and the user's evaluated
     * pre/post navigation hooks to `options`.
     *
     * @param options Crawler options whose hook arrays are mutated in place;
     *   both arrays must already exist.
     */
    private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session, blockRequests }, gotoOptions) => {
                // Forward the browser console when requested.
                if (this.input.browserLog) browserTools.dumpConsole(page);

                // Block media and stylesheets unless the user asked for them.
                if (this.blockedUrlPatterns.length) {
                    await blockRequests({
                        urlPatterns: this.blockedUrlPatterns,
                    });
                }

                // Add initial cookies, if any.
                if (this.input.initialCookies?.length) {
                    // With a session, only the cookies reported as missing
                    // from it are set; without one, all initial cookies are.
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
                        // Keep the session and the browser context in sync.
                        session?.setCookies(cookiesToSet, request.url);
                        await page.context().addCookies(cookiesToSet);
                    }
                }

                if (gotoOptions) {
                    // Timeout is configured in seconds but applied in milliseconds.
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    /**
     * Wraps each user hook so that it receives the crawling context extended
     * with `Apify`, `Actor` (both the `Actor` class) and `customData`.
     *
     * @param hooks Evaluated user hooks.
     * @returns Wrapped hooks, in the same order.
     */
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    /**
     * Logs the first line of the last error message of a request that ran
     * out of retries and stores an error record for it.
     */
    private async _failedRequestHandler({
        request,
    }: PlaywrightCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * Handles a single page: prepares the page function context, invokes the
     * user's page function, then enqueues links and stores the result.
     *
     * @param crawlingContext Context supplied by the crawler for this request.
     */
    private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

        /**
         * PRE-PROCESSING
         */
        tools.ensureMetaData(request);

        // Stop here when the result limit has been reached and the pool was aborted.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

        // Copy property descriptors rather than values, so accessor properties
        // on the context are carried over as accessors instead of being
        // evaluated here.
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        // The page function gets a plain status/headers snapshot in place of
        // the Playwright response object.
        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

        // Set up and create the context.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        if (this.input.closeCookieModals) {
            // Evaluate the `idcac-playwright` script in the page, with fixed
            // pauses before and after.
            // NOTE(review): the pauses presumably give modals time to render
            // and the script time to act — confirm.
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

        /**
         * USER FUNCTION INVOCATION
         */
        const pageFunctionResult = await this.evaledPageFunction(context);

        /**
         * POST-PROCESSING
         */
        // Enqueue links unless the context state has `skipLinks` set.
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

        // Store the page function's result in the dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    /**
     * Aborts the autoscaled pool once the number of stored results reaches
     * `maxResultsPerCrawl`.
     *
     * @param autoscaledPool The crawler's pool; nothing happens without it.
     * @returns `true` when the pool was aborted, `false` otherwise.
     */
    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        ) {
            return false;
        }
        if (!autoscaledPool) return false;
        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    /**
     * Enqueues links found on the page, provided a request queue and a link
     * selector exist and the maximum crawling depth has not been reached.
     */
    private async _handleLinks({
        request,
        enqueueLinks,
    }: PlaywrightCrawlingContext) {
        if (!this.requestQueue) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        // Without a selector there is nothing to enqueue.
        if (!this.input.linkSelector) return;

        const enqueueOptions: EnqueueLinksOptions = {
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                // Record the parent and depth on each child request.
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        await enqueueLinks({
            ...enqueueOptions,
            selector: this.input.linkSelector,
        });
    }

    /**
     * Builds the dataset payload for a request, pushes it to the dataset and
     * increments the output counter.
     *
     * @param request The handled request.
     * @param response The Playwright response, when one is available.
     * @param pageFunctionResult Value returned by the page function.
     * @param isError Marks the record as coming from a failed request.
     */
    private async _handleResult(
        request: Request,
        response?: Response,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}