1import { readFile } from 'node:fs/promises';
2import { dirname } from 'node:path';
3import { fileURLToPath, URL } from 'node:url';
4
5import type {
6 AutoscaledPool,
7 EnqueueLinksOptions,
8 PlaywrightCrawlerOptions,
9 PlaywrightCrawlingContext,
10 PlaywrightLaunchContext,
11 ProxyConfiguration,
12 Request,
13} from '@crawlee/playwright';
14import {
15 Dataset,
16 KeyValueStore,
17 log,
18 PlaywrightCrawler,
19 RequestList,
20 RequestQueueV2,
21} from '@crawlee/playwright';
22import type { Awaitable, Dictionary } from '@crawlee/utils';
23import { sleep } from '@crawlee/utils';
24import type { ApifyEnv } from 'apify';
25import { Actor } from 'apify';
26import { getInjectableScript } from 'idcac-playwright';
27import type { Response } from 'playwright';
28import playwright from 'playwright';
29
30import type {
31 CrawlerSetupOptions,
32 RequestMetadata,
33} from '@apify/scraper-tools';
34import {
35 browserTools,
36 constants as scraperToolsConstants,
37 createContext,
38 tools,
39} from '@apify/scraper-tools';
40
41import type { Input } from './consts.js';
42import { ProxyRotation } from './consts.js';
43
// Key-value store id used for persisting session pool state; it is only
// passed to the session pool when the input sets `sessionPoolName`.
const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';

// Key-value store record whose existence means the start URLs were already
// added to the request queue, so they are not added a second time.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

// Local aliases for the shared scraper-tools constants used in this module.
const META_KEY = scraperToolsConstants.META_KEY;
const DEFAULT_VIEWPORT = scraperToolsConstants.DEFAULT_VIEWPORT;
const DEVTOOLS_TIMEOUT_SECS = scraperToolsConstants.DEVTOOLS_TIMEOUT_SECS;
const SESSION_MAX_USAGE_COUNTS = scraperToolsConstants.SESSION_MAX_USAGE_COUNTS;

// Input schema, read from INPUT_SCHEMA.json two directories above this module
// and parsed once at module load (top-level await).
const schemaText = await readFile(
    new URL('../../INPUT_SCHEMA.json', import.meta.url),
    'utf8',
);
const SCHEMA = JSON.parse(schemaText);
56
57
58
59
60
/**
 * Turns the Actor input into a configured `PlaywrightCrawler`.
 *
 * The constructor validates the input and evaluates the user-supplied
 * functions synchronously, then kicks off asynchronous storage
 * initialization. `createCrawler()` waits for that initialization and
 * returns the crawler wired to this instance's request handlers.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Playwright Scraper';

    /** JSON snapshot of the input exactly as it was passed to the constructor. */
    rawInput: string;
    env: ApifyEnv;

    /** Shared map handed to every page-function context through `createContext`. */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    // NOTE(review): never assigned in this class; the handlers read
    // `input.customData` directly. Presumably kept for `CrawlerSetupOptions`.
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];

    /** URL patterns passed to `blockRequests` before each navigation. */
    blockedUrlPatterns: string[] = [];

    /** True when the page function source contains a `debugger;` statement. */
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PlaywrightCrawler;
    dataset!: Dataset;

    /** Number of items pushed to the dataset, seeded from its existing item count. */
    pagesOutputted!: number;

    /** Settles once the storages are opened and the start URLs are queued. */
    private initPromise: Promise<void>;

    /**
     * @param input Actor input. It is validated here and kept by reference.
     * @throws Error when the input fails one of the validations below, or when
     *   one of the scraper-tools helpers (schema check, function evaluation) throws.
     */
    constructor(input: Input) {
        // Raise the log level first so that nothing logged during setup is lost.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Serialized before any helper below gets a chance to touch `input`.
        this.rawInput = JSON.stringify(input);

        // Presumably fills in `pageFunction` from a file located next to this
        // module when the input does not carry one - confirm in scraper-tools.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // The schema check is only performed when not running on the Apify platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        CrawlerSetup._validateInput(this.input);

        // The session usage limit depends on the selected proxy rotation mode.
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // User code arrives as strings and must be evaluated into functions.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );
        this.evaledPreNavigationHooks = this.input.preNavigationHooks
            ? tools.evalFunctionArrayOrThrow(
                  this.input.preNavigationHooks,
                  'preNavigationHooks',
              )
            : [];
        this.evaledPostNavigationHooks = this.input.postNavigationHooks
            ? tools.evalFunctionArrayOrThrow(
                  this.input.postNavigationHooks,
                  'postNavigationHooks',
              )
            : [];

        // Media and font files are blocked unless the input asks for them.
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [
                '.jpg',
                '.jpeg',
                '.png',
                '.svg',
                '.gif',
                '.webp',
                '.webm',
                '.ico',
                '.woff',
                '.eot',
            ];
        }
        // Stylesheets likewise.
        if (!this.input.downloadCss) {
            this.blockedUrlPatterns.push('.css');
        }

        // A `debugger;` statement in the page function enables the browser
        // devtools and swaps the timeouts for DEVTOOLS_TIMEOUT_SECS.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Optional storage names; `undefined` is passed to `open()` as-is.
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Placeholders until `_initializeAsync()` / `createCrawler()` fill them.
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

    /**
     * Structural checks on the input that are performed on every run.
     *
     * @throws Error with a user-facing message on the first failed check.
     */
    private static _validateInput(input: Input): void {
        for (const purl of input.pseudoUrls) {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        }

        if (input.useChrome && input.launcher !== 'chromium') {
            throw new Error(
                'useChrome option could only be used with Chromium browser selected.',
            );
        }

        for (const cookie of input.initialCookies) {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        }

        if (!/^(domcontentloaded|load|networkidle)$/.test(input.waitUntil)) {
            throw new Error(
                'Navigation wait until event must be valid. See tooltip.',
            );
        }
    }

    /**
     * Opens the key-value store, request queue and dataset, and enqueues the
     * start URLs unless the init flag record says that was already done.
     */
    private async _initializeAsync() {
        // Start requests always use the extended unique key and follow the
        // input's URL-fragment setting. Note the input objects are updated in place.
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        const alreadyInitialized = await this.keyValueStore.recordExists(
            REQUEST_QUEUE_INIT_FLAG_KEY,
        );
        if (!alreadyInitialized) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(
                null,
                startUrls,
            )) {
                // With a result limit in place, stop collecting start
                // requests once 1.5x that limit has been gathered.
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);

            // Deliberately not awaited: the flag is written in the background
            // once every request is in the queue. Failures are logged rather
            // than left as an unhandled rejection; the flag then stays unset.
            void waitForAllRequestsToBeAdded
                .then(async () => {
                    await this.keyValueStore.setValue(
                        REQUEST_QUEUE_INIT_FLAG_KEY,
                        '1',
                    );
                })
                .catch((error: unknown) => {
                    const reason =
                        error instanceof Error ? error.message : String(error);
                    log.error(
                        `Failed to finish adding start requests to the request queue: ${reason}`,
                    );
                });
        }

        this.dataset = await Dataset.open(this.datasetName);
        // Continue counting from whatever the dataset already contains.
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;
    }

    /**
     * Waits for storage initialization, then builds the `PlaywrightCrawler`.
     *
     * @returns The crawler instance, which is also stored on `this.crawler`.
     */
    async createCrawler(): Promise<PlaywrightCrawler> {
        await this.initPromise;

        const args: string[] = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

        const options: PlaywrightCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestQueue: this.requestQueue,
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            // Filled in by `_createNavigationHooks()` below.
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                useChrome: this.input.useChrome,
                launcher: playwright[this.input.launcher],
                launchOptions: {
                    ignoreHTTPSErrors: this.input.ignoreSslErrors,
                    bypassCSP: this.input.ignoreCorsAndCsp,
                    defaultViewport: DEFAULT_VIEWPORT,
                    devtools: this.devtools,
                    args,
                    headless: this.input.headless,
                },
            } as PlaywrightLaunchContext,
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                // The session store id is only supplied when a pool name is given.
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // "Until failure" rotation is implemented with a pool of one session.
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PlaywrightCrawler(options);

        return this.crawler;
    }

    /**
     * Adds the built-in pre-navigation hook followed by the user's
     * pre-navigation hooks, and the user's post-navigation hooks, to `options`.
     */
    private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session, blockRequests }, gotoOptions) => {
                // Optionally forward the browser console via scraper-tools.
                if (this.input.browserLog) browserTools.dumpConsole(page);

                // Block media/stylesheet requests configured in the constructor.
                if (this.blockedUrlPatterns.length) {
                    await blockRequests({
                        urlPatterns: this.blockedUrlPatterns,
                    });
                }

                // Apply the initial cookies. With a session available, only the
                // cookies returned by `getMissingCookiesFromSession` are set.
                if (this.input.initialCookies?.length) {
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
                        // Set them on both the session and the browser context.
                        session?.setCookies(cookiesToSet, request.url);
                        await page.context().addCookies(cookiesToSet);
                    }
                }

                if (gotoOptions) {
                    // Seconds from the input are converted to milliseconds.
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    /**
     * Wraps each user hook so that its context argument additionally carries
     * `Apify`, `Actor` (both the `Actor` class) and the input's `customData`.
     */
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    /**
     * Logs the first line of the request's last error message and stores an
     * error result for the request in the dataset.
     */
    private async _failedRequestHandler({
        request,
    }: PlaywrightCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * Handles one page: builds the page-function context, invokes the user's
     * page function, then enqueues further links and stores the result.
     */
    private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

        // Presumably guarantees the internal metadata (read later under
        // META_KEY in `_handleLinks`) exists on the request - see scraper-tools.
        tools.ensureMetaData(request);

        // Stop right away when the result limit has already been reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

        // Copy property descriptors rather than values so that accessors on
        // the crawling context are carried over without being invoked.
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        // The page function gets a reduced response with status and headers only.
        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        if (this.input.closeCookieModals) {
            // Fixed pauses before and after injecting the cookie-modal script.
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

        // Run the user's page function.
        const pageFunctionResult = await this.evaledPageFunction(context);

        // Enqueue further links unless the context state has `skipLinks` set.
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

        // Store the page function's return value in the dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    /**
     * Aborts the autoscaled pool once `maxResultsPerCrawl` results were stored.
     *
     * @returns `true` when the pool was aborted; `false` when no limit is set,
     *   the limit is not reached yet, or no pool is available.
     */
    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        const limit = this.input.maxResultsPerCrawl;
        if (!limit || this.pagesOutputted < limit) return false;
        if (!autoscaledPool) return false;

        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    /**
     * Enqueues links found on the page, provided the maximum crawling depth
     * is not reached and the input defines a `linkSelector`.
     */
    private async _handleLinks({
        request,
        enqueueLinks,
    }: PlaywrightCrawlingContext) {
        if (!this.requestQueue) return;

        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        // Without a selector nothing is enqueued, so skip building the options.
        if (!this.input.linkSelector) return;

        const enqueueOptions: EnqueueLinksOptions = {
            selector: this.input.linkSelector,
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                // Record the parent request and depth on every child request.
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                // Same unique-key settings as applied to the start URLs.
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        await enqueueLinks(enqueueOptions);
    }

    /**
     * Builds the dataset payload for a request via scraper-tools, pushes it to
     * the dataset and increments the result counter.
     *
     * @param request Request the result belongs to.
     * @param response Playwright response, when one is available.
     * @param pageFunctionResult Value returned by the user's page function.
     * @param isError Passed as `true` by the failed-request handler.
     */
    private async _handleResult(
        request: Request,
        response?: Response,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}