import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    EnqueueLinksOptions,
    PlaywrightCrawlerOptions,
    PlaywrightCrawlingContext,
    PlaywrightLaunchContext,
    ProxyConfiguration,
    Request,
} from '@crawlee/playwright';
import {
    Dataset,
    KeyValueStore,
    log,
    PlaywrightCrawler,
    RequestList,
    RequestQueueV2,
} from '@crawlee/playwright';
import type { Awaitable, Dictionary } from '@crawlee/utils';
import { sleep } from '@crawlee/utils';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { launchOptions } from 'camoufox-js';
import { getInjectableScript } from 'idcac-playwright';
import type { Response } from 'playwright';
import { firefox } from 'playwright';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    browserTools,
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

const { META_KEY, DEVTOOLS_TIMEOUT_SECS, SESSION_MAX_USAGE_COUNTS } =
    scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);

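/**
 * Holds the configuration and state shared by the whole crawl: the parsed input,
 * the opened storages, and the evaluated page function and navigation hooks.
 * A single instance is created per run and used to build the PlaywrightCrawler.
 */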
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Playwright Scraper';
    rawInput: string;
    env: ApifyEnv;

    // In-memory store shared by all page function invocations within this run.
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PlaywrightCrawler;
    dataset!: Dataset;
    pagesOutputted!: number;
    private initPromise: Promise<void>;

    constructor(input: Input) {
        // Set up logging.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep a copy of the raw input. It is exposed to the page function context.
        this.rawInput = JSON.stringify(input);

        // Attempt to load the page function from disk if it is not defined in the input.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Validate the input against the schema when not running on the Apify platform,
        // which performs the validation itself.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        if (
            !/^(domcontentloaded|load|networkidle)$/.test(this.input.waitUntil)
        ) {
            throw new Error(
                'Navigation wait until event must be valid. See tooltip.',
            );
        }

        // How many times a session may be reused depends on the selected proxy rotation.
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Evaluate the user-provided page function and navigation hooks.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

        // Use extended timeouts when the page function contains a debugger statement,
        // so there is enough time for debugging in DevTools.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Named storages
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Initialized asynchronously in _initializeAsync().
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

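    /**
     * Opens the named (or default) key-value store, request queue and dataset,
     * seeds the request queue from the start URLs on the first run, and restores
     * the count of pages already outputted to the dataset.
     */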
    private async _initializeAsync() {
        // Start URLs
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        // KeyValueStore
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

        // RequestQueue
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        // Seed the queue from the start URLs only once per run.
        if (
            !(await this.keyValueStore.recordExists(
                REQUEST_QUEUE_INIT_FLAG_KEY,
            ))
        ) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(
                null,
                startUrls,
            )) {
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);

            // Mark the queue as initialized once all requests have been added,
            // so that a resurrected run does not enqueue the start URLs again.
            void waitForAllRequestsToBeAdded.then(async () => {
                await this.keyValueStore.setValue(
                    REQUEST_QUEUE_INIT_FLAG_KEY,
                    '1',
                );
            });
        }

        // Dataset
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;
    }

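    /**
     * Waits for the async initialization to finish and builds a PlaywrightCrawler
     * (Firefox launched with Camoufox options) from the validated input.
     */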
    async createCrawler() {
        await this.initPromise;

        const options: PlaywrightCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestQueue: this.requestQueue,
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl:
                this.input.maxPagesPerCrawl === 0
                    ? undefined
                    : this.input.maxPagesPerCrawl,
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                launcher: firefox,
                // Camoufox builds anti-detection Firefox launch options from the input.
                launchOptions: await launchOptions({
                    ...this.input,
                    humanize: this.input.humanize
                        ? Number(this.input.humanize)
                        : 0,
                }),
            } as PlaywrightLaunchContext,
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // A single session means the same proxy is reused until it fails.
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PlaywrightCrawler(options);

        return this.crawler;
    }

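    /**
     * Registers the built-in pre-navigation hook (browser console dumping, initial
     * cookies, navigation timeout and waitUntil) and appends the user-provided
     * pre- and post-navigation hooks with an enhanced context.
     */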
    private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session }, gotoOptions) => {
                // Attach the browser console to the run log if requested.
                if (this.input.browserLog) browserTools.dumpConsole(page);

                // Add initial cookies, if any.
                if (
                    this.input.initialCookies &&
                    this.input.initialCookies.length
                ) {
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
                        // Remember the new cookies in the session and set them in the browser context.
                        session?.setCookies(cookiesToSet, request.url);
                        await page.context().addCookies(cookiesToSet);
                    }
                }

                if (gotoOptions) {
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

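    /**
     * Wraps user-provided navigation hooks so that they receive the crawling context
     * extended with the Actor object and the custom data from the input.
     */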
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

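    /**
     * Called when a request exhausts all retries. Logs the last error message and
     * pushes an error record for the request to the dataset.
     */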
    private async _failedRequestHandler({
        request,
    }: PlaywrightCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

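    /**
     * First initializes the state that is exposed to the user via the page function
     * context, then invokes the user-provided page function with that context, and
     * finally enqueues links and saves the returned value to the dataset.
     */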
    private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

        // Make sure that internal metadata (crawling depth, parent request)
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawl if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

        // Copy the crawling context properties, including getters, into the
        // arguments passed to the page function.
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        // Expose only the serializable parts of the Playwright response.
        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

        // Set up the context that the page function receives.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        // Attempt to close cookie consent modals using the
        // "I don't care about cookies" script.
        if (this.input.closeCookieModals) {
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

        // Execute the user-provided page function.
        const pageFunctionResult = await this.evaledPageFunction(context);

        // Enqueue more links unless the user called skipLinks() in the page function.
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

        // Save the page function's result to the dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

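    /**
     * Aborts the autoscaled pool once the number of outputted pages reaches the
     * maxResultsPerCrawl limit. Returns true when the crawl is being aborted.
     */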
    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

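    /**
     * Enqueues links found on the page using the configured link selector, globs and
     * pseudo URLs, unless the maximum crawling depth has been reached. The crawling
     * depth and parent request id are stored in the userData of each new request.
     */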
    private async _handleLinks({
        request,
        enqueueLinks,
    }: PlaywrightCrawlingContext) {
        if (!this.requestQueue) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        const enqueueOptions: EnqueueLinksOptions = {
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        if (this.input.linkSelector) {
            await enqueueLinks({
                ...enqueueOptions,
                selector: this.input.linkSelector,
            });
        }
    }

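    /**
     * Pushes the page function result (or an error record) for the given request to
     * the dataset and increments the counter of outputted pages.
     */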
    private async _handleResult(
        request: Request,
        response?: Response,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}