import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    EnqueueLinksOptions,
    PlaywrightCrawlerOptions,
    PlaywrightCrawlingContext,
    PlaywrightLaunchContext,
    ProxyConfiguration,
    Request,
} from '@crawlee/playwright';
import {
    Dataset,
    KeyValueStore,
    log,
    PlaywrightCrawler,
    RequestList,
    RequestQueueV2,
} from '@crawlee/playwright';
import type { Awaitable, Dictionary } from '@crawlee/utils';
import { sleep } from '@crawlee/utils';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { launchOptions } from 'camoufox-js';
import { getInjectableScript } from 'idcac-playwright';
import type { Response } from 'playwright';
import { firefox } from 'playwright';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    browserTools,
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';

const { META_KEY, DEVTOOLS_TIMEOUT_SECS, SESSION_MAX_USAGE_COUNTS } =
    scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);

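/**
 * Holds all the information necessary to set up a PlaywrightCrawler
 * and to create the context passed to each pageFunction invocation.
 */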
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Playwright Scraper';
    rawInput: string;
    env: ApifyEnv;

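    // In-memory store shared by all pageFunction invocations within this run.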
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PlaywrightCrawler;
    requestList!: RequestList;
    dataset!: Dataset;
    pagesOutputted!: number;
    private initPromise: Promise<void>;

    constructor(input: Input) {
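        // Raise the log level to DEBUG as early as possible when requested.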
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

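        // Keep a snapshot of the raw input as a string before it is modified below.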
        this.rawInput = JSON.stringify(input);

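        // Optionally load the pageFunction from disk, resolved relative to this module's directory.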
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

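        // Validate the input against the schema when not running on the Apify platform.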
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

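        // Validate the structure of pseudoUrls, initialCookies and waitUntil.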
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        if (
            !/^(domcontentloaded|load|networkidle)$/.test(this.input.waitUntil)
        ) {
            throw new Error(
                'Navigation wait until event must be valid. See tooltip.',
            );
        }

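        // The proxy rotation setting determines how many times a session may be reused.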
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

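        // Evaluate the user-provided functions from their string sources.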
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

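        // A `debugger;` statement in the pageFunction signals an interactive debugging session,
        // so the extended DevTools timeouts are used below.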
        this.devtools = this.input.pageFunction.includes('debugger;');

        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

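        // The crawler and storages are created asynchronously in _initializeAsync().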
        this.crawler = null!;
        this.requestList = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

    private async _initializeAsync() {
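        // Prepare the start URLs (extended unique keys, optional URL fragments) and open the RequestList.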
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.requestList = await RequestList.open(
            'PLAYWRIGHT_SCRAPER',
            startUrls,
        );

        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

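        // Open the Dataset and count the items already in it.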
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
    }

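    /**
     * Resolves to a fully configured `PlaywrightCrawler` instance,
     * once the asynchronous initialization has finished.
     */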
    async createCrawler() {
        await this.initPromise;

        const options: PlaywrightCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                launcher: firefox,
                launchOptions: await launchOptions({
                    ...this.input,
                    humanize: this.input.humanize
                        ? Number(this.input.humanize)
                        : 0,
                }),
            } as PlaywrightLaunchContext,
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PlaywrightCrawler(options);

        return this.crawler;
    }

    private _createNavigationHooks(options: PlaywrightCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session }, gotoOptions) => {
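                // Attach a console listener so browser console messages make it into the log.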
                if (this.input.browserLog) browserTools.dumpConsole(page);

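                // Add initial cookies, if any.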
                if (
                    this.input.initialCookies &&
                    this.input.initialCookies.length
                ) {
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
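                        // Store the missing cookies on the session and set them on the browser context.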
                        session?.setCookies(cookiesToSet, request.url);
                        await page.context().addCookies(cookiesToSet);
                    }
                }

                if (gotoOptions) {
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

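    // Wraps the user-provided hooks so they also receive the Actor object and customData.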
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    private async _failedRequestHandler({
        request,
    }: PlaywrightCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

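    /**
     * Runs for every request: ensures crawl metadata is present, builds the
     * pageFunction context, executes the user-provided pageFunction, and then
     * enqueues links and pushes the result to the dataset.
     */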
    private async _requestHandler(crawlingContext: PlaywrightCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

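        // Make sure an object with the internal crawl metadata is present on every request.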
        tools.ensureMetaData(request);

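        // Abort the crawl when the user-set limit on results has been reached.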
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

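        // Copy the crawling context via property descriptors so that its getters are not triggered here.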
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

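        // Set up and create the context that is passed to the pageFunction.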
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

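        // Optionally dismiss cookie consent modals by injecting the idcac script.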
        if (this.input.closeCookieModals) {
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

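        // Run the user-provided pageFunction.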
        const pageFunctionResult = await this.evaledPageFunction(context);

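        // Enqueue more links unless the pageFunction requested skipping them or the maximum depth was reached.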
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

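        // Save the pageFunction's result to the dataset.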
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User-set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    private async _handleLinks({
        request,
        enqueueLinks,
    }: PlaywrightCrawlingContext) {
        if (!this.requestQueue) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        const enqueueOptions: EnqueueLinksOptions = {
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        if (this.input.linkSelector) {
            await enqueueLinks({
                ...enqueueOptions,
                selector: this.input.linkSelector,
            });
        }
    }

    private async _handleResult(
        request: Request,
        response?: Response,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}