1import { readFile } from 'node:fs/promises';
2import { dirname } from 'node:path';
3import { fileURLToPath, URL } from 'node:url';
4
5import type {
6 AutoscaledPool,
7 EnqueueLinksOptions,
8 PlaywrightCrawlerOptions,
9 PlaywrightCrawlingContext,
10 PlaywrightLaunchContext,
11 ProxyConfiguration,
12 Request,
13} from '@crawlee/playwright';
14import {
15 Dataset,
16 KeyValueStore,
17 log,
18 PlaywrightCrawler,
19 RequestList,
20 RequestQueueV2,
21} from '@crawlee/playwright';
22import type { Awaitable, Dictionary } from '@crawlee/utils';
23import { sleep } from '@crawlee/utils';
24import type { ApifyEnv } from 'apify';
25import { Actor } from 'apify';
26import { getInjectableScript } from 'idcac-playwright';
27import type { Response } from 'playwright';
28import playwright from 'playwright';
29
30import type {
31 CrawlerSetupOptions,
32 RequestMetadata,
33} from '@apify/scraper-tools';
34import {
35 browserTools,
36 constants as scraperToolsConstants,
37 createContext,
38 tools,
39} from '@apify/scraper-tools';
40
41import type { Input } from './consts.js';
42import { ProxyRotation } from './consts.js';
43
// Name of the key-value store used to persist the session pool state, so that
// sessions survive actor migrations/resurrections when `sessionPoolName` is set.
const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
// Key-value store flag marking that the start URLs were already enqueued;
// prevents a resurrected run from enqueueing them a second time.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

// Shared constants from @apify/scraper-tools (request-metadata key, default
// viewport, devtools timeout, and per-proxy-rotation session usage limits).
const {
    META_KEY,
    DEFAULT_VIEWPORT,
    DEVTOOLS_TIMEOUT_SECS,
    SESSION_MAX_USAGE_COUNTS,
} = scraperToolsConstants;
// Actor input schema, loaded once at module load time (top-level await).
// Used to validate input locally when not running on the Apify platform.
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);
56
57
58
59
60
61export class CrawlerSetup implements CrawlerSetupOptions {
62 name = 'Playwright Scraper';
63 rawInput: string;
64 env: ApifyEnv;
65
66
67
68 globalStore = new Map();
69 requestQueue: RequestQueueV2;
70 keyValueStore: KeyValueStore;
71 customData: unknown;
72 input: Input;
73 maxSessionUsageCount: number;
74 evaledPageFunction: (...args: unknown[]) => unknown;
75 evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
76 evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
77 blockedUrlPatterns: string[] = [];
78 devtools: boolean;
79 datasetName?: string;
80 keyValueStoreName?: string;
81 requestQueueName?: string;
82
83 crawler!: PlaywrightCrawler;
84 dataset!: Dataset;
85 pagesOutputted!: number;
86 private initPromise: Promise<void>;
87
88 constructor(input: Input) {
89
90 if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);
91
92
93 this.rawInput = JSON.stringify(input);
94
95
96 tools.maybeLoadPageFunctionFromDisk(
97 input,
98 dirname(fileURLToPath(import.meta.url)),
99 );
100
101
102 if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);
103
104 this.input = input;
105 this.env = Actor.getEnv();
106
107
108 this.input.pseudoUrls.forEach((purl) => {
109 if (!tools.isPlainObject(purl)) {
110 throw new Error(
111 'The pseudoUrls Array must only contain Objects.',
112 );
113 }
114 if (purl.userData && !tools.isPlainObject(purl.userData)) {
115 throw new Error(
116 'The userData property of a pseudoUrl must be an Object.',
117 );
118 }
119 });
120
121 if (this.input.useChrome && this.input.launcher !== 'chromium') {
122 throw new Error(
123 'useChrome option could only be used with Chromium browser selected.',
124 );
125 }
126
127 this.input.initialCookies.forEach((cookie) => {
128 if (!tools.isPlainObject(cookie)) {
129 throw new Error(
130 'The initialCookies Array must only contain Objects.',
131 );
132 }
133 });
134
135 if (
136 !/^(domcontentloaded|load|networkidle)$/.test(this.input.waitUntil)
137 ) {
138 throw new Error(
139 'Navigation wait until event must be valid. See tooltip.',
140 );
141 }
142
143
144 this.maxSessionUsageCount =
145 SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];
146
147
148 this.evaledPageFunction = tools.evalFunctionOrThrow(
149 this.input.pageFunction,
150 );
151
152 if (this.input.preNavigationHooks) {
153 this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
154 this.input.preNavigationHooks,
155 'preNavigationHooks',
156 );
157 } else {
158 this.evaledPreNavigationHooks = [];
159 }
160
161 if (this.input.postNavigationHooks) {
162 this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
163 this.input.postNavigationHooks,
164 'postNavigationHooks',
165 );
166 } else {
167 this.evaledPostNavigationHooks = [];
168 }
169
170
171 this.blockedUrlPatterns = [];
172 if (!this.input.downloadMedia) {
173 this.blockedUrlPatterns = [
174 '.jpg',
175 '.jpeg',
176 '.png',
177 '.svg',
178 '.gif',
179 '.webp',
180 '.webm',
181 '.ico',
182 '.woff',
183 '.eot',
184 ];
185 }
186
187 if (!this.input.downloadCss) {
188 this.blockedUrlPatterns.push('.css');
189 }
190
191
192 this.devtools = this.input.pageFunction.includes('debugger;');
193
194
195 this.datasetName = this.input.datasetName;
196 this.keyValueStoreName = this.input.keyValueStoreName;
197 this.requestQueueName = this.input.requestQueueName;
198
199
200 this.crawler = null!;
201 this.requestQueue = null!;
202 this.dataset = null!;
203 this.keyValueStore = null!;
204 this.initPromise = this._initializeAsync();
205 }
206
207 private async _initializeAsync() {
208
209 const startUrls = this.input.startUrls.map((req) => {
210 req.useExtendedUniqueKey = true;
211 req.keepUrlFragment = this.input.keepUrlFragments;
212 return req;
213 });
214
215
216 this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
217
218
219 this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
220
221 if (
222 !(await this.keyValueStore.recordExists(
223 REQUEST_QUEUE_INIT_FLAG_KEY,
224 ))
225 ) {
226 const requests: Request[] = [];
227 for await (const request of await RequestList.open(
228 null,
229 startUrls,
230 )) {
231 if (
232 this.input.maxResultsPerCrawl > 0 &&
233 requests.length >= 1.5 * this.input.maxResultsPerCrawl
234 ) {
235 break;
236 }
237 requests.push(request);
238 }
239
240 const { waitForAllRequestsToBeAdded } =
241 await this.requestQueue.addRequestsBatched(requests);
242
243 void waitForAllRequestsToBeAdded.then(async () => {
244 await this.keyValueStore.setValue(
245 REQUEST_QUEUE_INIT_FLAG_KEY,
246 '1',
247 );
248 });
249 }
250
251
252 this.dataset = await Dataset.open(this.datasetName);
253 const info = await this.dataset.getInfo();
254 this.pagesOutputted = info?.itemCount ?? 0;
255 }
256
257
258
259
260 async createCrawler() {
261 await this.initPromise;
262
263 const args = [];
264 if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');
265
266 const options: PlaywrightCrawlerOptions = {
267 requestHandler: this._requestHandler.bind(this),
268 requestQueue: this.requestQueue,
269 requestHandlerTimeoutSecs: this.devtools
270 ? DEVTOOLS_TIMEOUT_SECS
271 : this.input.pageFunctionTimeoutSecs,
272 preNavigationHooks: [],
273 postNavigationHooks: [],
274 failedRequestHandler: this._failedRequestHandler.bind(this),
275 respectRobotsTxtFile: this.input.respectRobotsTxtFile,
276 maxConcurrency: this.input.maxConcurrency,
277 maxRequestRetries: this.input.maxRequestRetries,
278 maxRequestsPerCrawl:
279 this.input.maxPagesPerCrawl === 0
280 ? undefined
281 : this.input.maxPagesPerCrawl,
282 proxyConfiguration: (await Actor.createProxyConfiguration(
283 this.input.proxyConfiguration,
284 )) as any as ProxyConfiguration,
285 launchContext: {
286 useChrome: this.input.useChrome,
287 launcher: playwright[this.input.launcher],
288 launchOptions: {
289 ignoreHTTPSErrors: this.input.ignoreSslErrors,
290 bypassCSP: this.input.ignoreCorsAndCsp,
291 defaultViewport: DEFAULT_VIEWPORT,
292 devtools: this.devtools,
293 args,
294 headless: this.input.headless,
295 },
296 } as PlaywrightLaunchContext,
297 useSessionPool: true,
298 persistCookiesPerSession: true,
299 sessionPoolOptions: {
300 persistStateKeyValueStoreId: this.input.sessionPoolName
301 ? SESSION_STORE_NAME
302 : undefined,
303 persistStateKey: this.input.sessionPoolName,
304 sessionOptions: {
305 maxUsageCount: this.maxSessionUsageCount,
306 },
307 },
308 experiments: {
309 requestLocking: true,
310 },
311 };
312
313 this._createNavigationHooks(options);
314
315 if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
316 options.sessionPoolOptions!.maxPoolSize = 1;
317 }
318
319 this.crawler = new PlaywrightCrawler(options);
320
321 return this.crawler;
322 }
323
324 private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
325 options.preNavigationHooks!.push(
326 async ({ request, page, session, blockRequests }, gotoOptions) => {
327
328 if (this.input.browserLog) browserTools.dumpConsole(page);
329
330
331 if (this.blockedUrlPatterns.length) {
332 await blockRequests({
333 urlPatterns: this.blockedUrlPatterns,
334 });
335 }
336
337
338 if (
339 this.input.initialCookies &&
340 this.input.initialCookies.length
341 ) {
342 const cookiesToSet = session
343 ? tools.getMissingCookiesFromSession(
344 session,
345 this.input.initialCookies,
346 request.url,
347 )
348 : this.input.initialCookies;
349
350 if (cookiesToSet?.length) {
351
352 session?.setCookies(cookiesToSet, request.url);
353 await page.context().addCookies(cookiesToSet);
354 }
355 }
356
357 if (gotoOptions) {
358 gotoOptions.timeout =
359 (this.devtools
360 ? DEVTOOLS_TIMEOUT_SECS
361 : this.input.pageLoadTimeoutSecs) * 1000;
362 gotoOptions.waitUntil = this.input.waitUntil;
363 }
364 },
365 );
366
367 options.preNavigationHooks!.push(
368 ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
369 );
370 options.postNavigationHooks!.push(
371 ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
372 );
373 }
374
375 private _runHookWithEnhancedContext(
376 hooks: ((...args: unknown[]) => Awaitable<void>)[],
377 ) {
378 return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
379 const { customData } = this.input;
380 return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
381 });
382 }
383
384 private async _failedRequestHandler({
385 request,
386 }: PlaywrightCrawlingContext) {
387 const lastError =
388 request.errorMessages[request.errorMessages.length - 1];
389 const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
390 log.error(
391 `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
392 );
393 return this._handleResult(request, undefined, undefined, true);
394 }
395
396
397
398
399
400
401
402
403
404
405
406 private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
407 const { request, response, crawler } = crawlingContext;
408
409
410
411
412
413
414 tools.ensureMetaData(request);
415
416
417 const aborted = await this._handleMaxResultsPerCrawl(
418 crawler.autoscaledPool,
419 );
420 if (aborted) return;
421
422 const pageFunctionArguments: Dictionary = {};
423
424
425 Object.defineProperties(
426 pageFunctionArguments,
427 Object.getOwnPropertyDescriptors(crawlingContext),
428 );
429
430 pageFunctionArguments.response = {
431 status: response && response.status(),
432 headers: response && response.headers(),
433 };
434
435
436 const contextOptions = {
437 crawlerSetup: {
438 rawInput: this.rawInput,
439 env: this.env,
440 globalStore: this.globalStore,
441 requestQueue: this.requestQueue,
442 keyValueStore: this.keyValueStore,
443 customData: this.input.customData,
444 },
445 pageFunctionArguments,
446 };
447 const { context, state } = createContext(contextOptions);
448
449 if (this.input.closeCookieModals) {
450 await sleep(500);
451 await crawlingContext.page.evaluate(getInjectableScript());
452 await sleep(2000);
453 }
454
455 if (this.input.maxScrollHeightPixels > 0) {
456 await crawlingContext.infiniteScroll({
457 maxScrollHeight: this.input.maxScrollHeightPixels,
458 });
459 }
460
461
462
463
464 const pageFunctionResult = await this.evaledPageFunction(context);
465
466
467
468
469
470
471
472 if (!state.skipLinks) await this._handleLinks(crawlingContext);
473
474
475 await this._handleResult(
476 request,
477 response,
478 pageFunctionResult as Dictionary,
479 );
480 }
481
482 private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
483 if (
484 !this.input.maxResultsPerCrawl ||
485 this.pagesOutputted < this.input.maxResultsPerCrawl
486 )
487 return false;
488 if (!autoscaledPool) return false;
489 log.info(
490 `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
491 );
492 await autoscaledPool.abort();
493 return true;
494 }
495
496 private async _handleLinks({
497 request,
498 enqueueLinks,
499 }: PlaywrightCrawlingContext) {
500 if (!this.requestQueue) return;
501 const currentDepth = (request.userData![META_KEY] as RequestMetadata)
502 .depth;
503 const hasReachedMaxDepth =
504 this.input.maxCrawlingDepth &&
505 currentDepth >= this.input.maxCrawlingDepth;
506 if (hasReachedMaxDepth) {
507 log.debug(
508 `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
509 );
510 return;
511 }
512
513 const enqueueOptions: EnqueueLinksOptions = {
514 globs: this.input.globs,
515 pseudoUrls: this.input.pseudoUrls,
516 exclude: this.input.excludes,
517 transformRequestFunction: (requestOptions) => {
518 requestOptions.userData ??= {};
519 requestOptions.userData[META_KEY] = {
520 parentRequestId: request.id || request.uniqueKey,
521 depth: currentDepth + 1,
522 };
523
524 requestOptions.useExtendedUniqueKey = true;
525 requestOptions.keepUrlFragment = this.input.keepUrlFragments;
526 return requestOptions;
527 },
528 };
529
530 if (this.input.linkSelector) {
531 await enqueueLinks({
532 ...enqueueOptions,
533 selector: this.input.linkSelector,
534 });
535 }
536 }
537
538 private async _handleResult(
539 request: Request,
540 response?: Response,
541 pageFunctionResult?: Dictionary,
542 isError?: boolean,
543 ) {
544 const payload = tools.createDatasetPayload(
545 request,
546 response,
547 pageFunctionResult,
548 isError,
549 );
550 await this.dataset.pushData(payload);
551 this.pagesOutputted++;
552 }
553}