1import { readFile } from 'node:fs/promises';
2import { dirname } from 'node:path';
3import { fileURLToPath, URL } from 'node:url';
4
5import type {
6 AutoscaledPool,
7 EnqueueLinksByClickingElementsOptions,
8 EnqueueLinksOptions,
9 ProxyConfiguration,
10 PuppeteerCrawlerOptions,
11 PuppeteerCrawlingContext,
12 Request,
13} from '@crawlee/puppeteer';
14import {
15 Dataset,
16 KeyValueStore,
17 log,
18 PuppeteerCrawler,
19 RequestList,
20 RequestQueueV2,
21} from '@crawlee/puppeteer';
22import type { Awaitable, Dictionary } from '@crawlee/utils';
23import { sleep } from '@crawlee/utils';
24import type { ApifyEnv } from 'apify';
25import { Actor } from 'apify';
26import { getInjectableScript } from 'idcac-playwright';
27import type { HTTPResponse } from 'puppeteer';
28
29import type {
30 CrawlerSetupOptions,
31 RequestMetadata,
32} from '@apify/scraper-tools';
33import {
34 browserTools,
35 constants as scraperToolsConstants,
36 createContext,
37 tools,
38} from '@apify/scraper-tools';
39
40import type { Input } from './consts.js';
41import { ProxyRotation } from './consts.js';
42
// Key-value store that holds the persisted session pool state when the user
// names the pool via `sessionPoolName` in the input.
const SESSION_STORE_NAME = 'APIFY-PUPPETEER-SCRAPER-SESSION-STORE';
// Key-value store record marking that the start URLs were already enqueued,
// so a restarted/resurrected run does not enqueue them a second time.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

const {
    META_KEY,
    DEFAULT_VIEWPORT,
    DEVTOOLS_TIMEOUT_SECS,
    SESSION_MAX_USAGE_COUNTS,
} = scraperToolsConstants;
// Input schema, loaded once at module load time (top-level await) relative to
// this file; used by `tools.checkInputOrThrow` when not running on the
// Apify platform (see `Actor.isAtHome()` check in the constructor).
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);
55
56
57
58
59
60export class CrawlerSetup implements CrawlerSetupOptions {
61 name = 'Puppeteer Scraper';
62 rawInput: string;
63 env: ApifyEnv;
64
65
66
67 globalStore = new Map();
68 requestQueue: RequestQueueV2;
69 keyValueStore: KeyValueStore;
70 customData: unknown;
71 input: Input;
72 maxSessionUsageCount: number;
73 evaledPageFunction: (...args: unknown[]) => unknown;
74 evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
75 evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
76 blockedUrlPatterns: string[] = [];
77 devtools: boolean;
78 datasetName?: string;
79 keyValueStoreName?: string;
80 requestQueueName?: string;
81
82 crawler!: PuppeteerCrawler;
83 dataset!: Dataset;
84 pagesOutputted!: number;
85 private initPromise: Promise<void>;
86
87 constructor(input: Input) {
88
89 if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);
90
91
92 this.rawInput = JSON.stringify(input);
93
94
95 tools.maybeLoadPageFunctionFromDisk(
96 input,
97 dirname(fileURLToPath(import.meta.url)),
98 );
99
100
101 if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);
102
103 this.input = input;
104 this.env = Actor.getEnv();
105
106
107 this.input.pseudoUrls.forEach((purl) => {
108 if (!tools.isPlainObject(purl)) {
109 throw new Error(
110 'The pseudoUrls Array must only contain Objects.',
111 );
112 }
113 if (purl.userData && !tools.isPlainObject(purl.userData)) {
114 throw new Error(
115 'The userData property of a pseudoUrl must be an Object.',
116 );
117 }
118 });
119
120 this.input.initialCookies.forEach((cookie) => {
121 if (!tools.isPlainObject(cookie)) {
122 throw new Error(
123 'The initialCookies Array must only contain Objects.',
124 );
125 }
126 });
127
128 this.input.waitUntil.forEach((event) => {
129 if (
130 !/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(
131 event,
132 )
133 ) {
134 throw new Error(
135 'Navigation wait until events must be valid. See tooltip.',
136 );
137 }
138 });
139
140
141 this.maxSessionUsageCount =
142 SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];
143
144
145 this.evaledPageFunction = tools.evalFunctionOrThrow(
146 this.input.pageFunction,
147 );
148
149 if (this.input.preNavigationHooks) {
150 this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
151 this.input.preNavigationHooks,
152 'preNavigationHooks',
153 );
154 } else {
155 this.evaledPreNavigationHooks = [];
156 }
157
158 if (this.input.postNavigationHooks) {
159 this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
160 this.input.postNavigationHooks,
161 'postNavigationHooks',
162 );
163 } else {
164 this.evaledPostNavigationHooks = [];
165 }
166
167
168 this.blockedUrlPatterns = [];
169 if (!this.input.downloadMedia) {
170 this.blockedUrlPatterns = [
171 '.jpg',
172 '.jpeg',
173 '.png',
174 '.svg',
175 '.gif',
176 '.webp',
177 '.webm',
178 '.ico',
179 '.woff',
180 '.eot',
181 ];
182 }
183
184 if (!this.input.downloadCss) {
185 this.blockedUrlPatterns.push('.css');
186 }
187
188
189 this.devtools = this.input.pageFunction.includes('debugger;');
190
191
192 this.datasetName = this.input.datasetName;
193 this.keyValueStoreName = this.input.keyValueStoreName;
194 this.requestQueueName = this.input.requestQueueName;
195
196
197 this.crawler = null!;
198 this.requestQueue = null!;
199 this.dataset = null!;
200 this.keyValueStore = null!;
201 this.initPromise = this._initializeAsync();
202 }
203
204 private async _initializeAsync() {
205
206 const startUrls = this.input.startUrls.map((req) => {
207 req.useExtendedUniqueKey = true;
208 req.keepUrlFragment = this.input.keepUrlFragments;
209 return req;
210 });
211
212
213 this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
214
215
216 this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
217
218 if (
219 !(await this.keyValueStore.recordExists(
220 REQUEST_QUEUE_INIT_FLAG_KEY,
221 ))
222 ) {
223 const requests: Request[] = [];
224 for await (const request of await RequestList.open(
225 null,
226 startUrls,
227 )) {
228 if (
229 this.input.maxResultsPerCrawl > 0 &&
230 requests.length >= 1.5 * this.input.maxResultsPerCrawl
231 ) {
232 break;
233 }
234 requests.push(request);
235 }
236
237 const { waitForAllRequestsToBeAdded } =
238 await this.requestQueue.addRequestsBatched(requests);
239
240 void waitForAllRequestsToBeAdded.then(async () => {
241 await this.keyValueStore.setValue(
242 REQUEST_QUEUE_INIT_FLAG_KEY,
243 '1',
244 );
245 });
246 }
247
248
249 this.dataset = await Dataset.open(this.datasetName);
250 const info = await this.dataset.getInfo();
251 this.pagesOutputted = info?.itemCount ?? 0;
252 }
253
254
255
256
257 async createCrawler() {
258 await this.initPromise;
259
260 const args = [];
261 if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');
262
263 const options: PuppeteerCrawlerOptions = {
264 requestHandler: this._requestHandler.bind(this),
265 requestQueue: this.requestQueue,
266 requestHandlerTimeoutSecs: this.devtools
267 ? DEVTOOLS_TIMEOUT_SECS
268 : this.input.pageFunctionTimeoutSecs,
269 preNavigationHooks: [],
270 postNavigationHooks: [],
271 failedRequestHandler: this._failedRequestHandler.bind(this),
272 respectRobotsTxtFile: this.input.respectRobotsTxtFile,
273 maxConcurrency: this.input.maxConcurrency,
274 maxRequestRetries: this.input.maxRequestRetries,
275 maxRequestsPerCrawl:
276 this.input.maxPagesPerCrawl === 0
277 ? undefined
278 : this.input.maxPagesPerCrawl,
279 proxyConfiguration: (await Actor.createProxyConfiguration(
280 this.input.proxyConfiguration,
281 )) as any as ProxyConfiguration,
282 launchContext: {
283 useChrome: this.input.useChrome,
284 launchOptions: {
285 acceptInsecureCerts: this.input.ignoreSslErrors,
286 defaultViewport: DEFAULT_VIEWPORT,
287 devtools: this.devtools,
288 args,
289 headless: this.input.headless,
290 },
291 },
292 useSessionPool: true,
293 persistCookiesPerSession: true,
294 sessionPoolOptions: {
295 persistStateKeyValueStoreId: this.input.sessionPoolName
296 ? SESSION_STORE_NAME
297 : undefined,
298 persistStateKey: this.input.sessionPoolName,
299 sessionOptions: {
300 maxUsageCount: this.maxSessionUsageCount,
301 },
302 },
303 experiments: {
304 requestLocking: true,
305 },
306 };
307
308 this._createNavigationHooks(options);
309
310 if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
311 options.sessionPoolOptions!.maxPoolSize = 1;
312 }
313
314 this.crawler = new PuppeteerCrawler(options);
315
316 return this.crawler;
317 }
318
319 private _createNavigationHooks(options: PuppeteerCrawlerOptions) {
320 options.preNavigationHooks!.push(
321 async ({ request, page, session, blockRequests }, gotoOptions) => {
322
323 if (this.input.browserLog) browserTools.dumpConsole(page);
324
325
326 if (this.blockedUrlPatterns.length) {
327 await blockRequests({
328 urlPatterns: this.blockedUrlPatterns,
329 });
330 }
331
332
333 if (
334 this.input.initialCookies &&
335 this.input.initialCookies.length
336 ) {
337 const cookiesToSet = session
338 ? tools.getMissingCookiesFromSession(
339 session,
340 this.input.initialCookies,
341 request.url,
342 )
343 : this.input.initialCookies;
344
345 if (cookiesToSet?.length) {
346
347 session?.setCookies(cookiesToSet, request.url);
348 await page.setCookie(...cookiesToSet);
349 }
350 }
351
352
353 if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);
354
355 if (gotoOptions) {
356 gotoOptions.timeout =
357 (this.devtools
358 ? DEVTOOLS_TIMEOUT_SECS
359 : this.input.pageLoadTimeoutSecs) * 1000;
360 gotoOptions.waitUntil = this.input.waitUntil;
361 }
362 },
363 );
364
365 options.preNavigationHooks!.push(
366 ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
367 );
368 options.postNavigationHooks!.push(
369 ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
370 );
371 }
372
373 private _runHookWithEnhancedContext(
374 hooks: ((...args: unknown[]) => Awaitable<void>)[],
375 ) {
376 return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
377 const { customData } = this.input;
378 return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
379 });
380 }
381
382 private async _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
383 const lastError =
384 request.errorMessages[request.errorMessages.length - 1];
385 const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
386 log.error(
387 `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
388 );
389 return this._handleResult(request, undefined, undefined, true);
390 }
391
392
393
394
395
396
397
398
399
400
401
402 private async _requestHandler(crawlingContext: PuppeteerCrawlingContext) {
403 const { request, response, crawler } = crawlingContext;
404
405
406
407
408
409
410 tools.ensureMetaData(request);
411
412
413 const aborted = await this._handleMaxResultsPerCrawl(
414 crawler.autoscaledPool,
415 );
416 if (aborted) return;
417
418 const pageFunctionArguments: Dictionary = {};
419
420
421 Object.defineProperties(
422 pageFunctionArguments,
423 Object.getOwnPropertyDescriptors(crawlingContext),
424 );
425
426 pageFunctionArguments.response = {
427 status: response && response.status(),
428 headers: response && response.headers(),
429 };
430
431
432 const contextOptions = {
433 crawlerSetup: {
434 rawInput: this.rawInput,
435 env: this.env,
436 globalStore: this.globalStore,
437 requestQueue: this.requestQueue,
438 keyValueStore: this.keyValueStore,
439 customData: this.input.customData,
440 },
441 pageFunctionArguments,
442 };
443 const { context, state } = createContext(contextOptions);
444
445 if (this.input.closeCookieModals) {
446 await sleep(500);
447 await crawlingContext.page.evaluate(getInjectableScript());
448 await sleep(2000);
449 }
450
451 if (this.input.maxScrollHeightPixels > 0) {
452 await crawlingContext.infiniteScroll({
453 maxScrollHeight: this.input.maxScrollHeightPixels,
454 });
455 }
456
457
458
459
460 const pageFunctionResult = await this.evaledPageFunction(context);
461
462
463
464
465
466
467
468 if (!state.skipLinks) await this._handleLinks(crawlingContext);
469
470
471 await this._handleResult(
472 request,
473 response,
474 pageFunctionResult as Dictionary,
475 );
476 }
477
478 private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
479 if (
480 !this.input.maxResultsPerCrawl ||
481 this.pagesOutputted < this.input.maxResultsPerCrawl
482 )
483 return false;
484 if (!autoscaledPool) return false;
485 log.info(
486 `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
487 );
488 await autoscaledPool.abort();
489 return true;
490 }
491
492 private async _handleLinks({
493 request,
494 enqueueLinks,
495 enqueueLinksByClickingElements,
496 }: PuppeteerCrawlingContext) {
497 if (!this.requestQueue) return;
498 const currentDepth = (request.userData![META_KEY] as RequestMetadata)
499 .depth;
500 const hasReachedMaxDepth =
501 this.input.maxCrawlingDepth &&
502 currentDepth >= this.input.maxCrawlingDepth;
503 if (hasReachedMaxDepth) {
504 log.debug(
505 `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
506 );
507 return;
508 }
509
510 const enqueueOptions: EnqueueLinksOptions = {
511 globs: this.input.globs,
512 pseudoUrls: this.input.pseudoUrls,
513 exclude: this.input.excludes,
514 transformRequestFunction: (requestOptions) => {
515 requestOptions.userData ??= {};
516 requestOptions.userData[META_KEY] = {
517 parentRequestId: request.id || request.uniqueKey,
518 depth: currentDepth + 1,
519 };
520
521 requestOptions.useExtendedUniqueKey = true;
522 requestOptions.keepUrlFragment = this.input.keepUrlFragments;
523 return requestOptions;
524 },
525 };
526
527 if (this.input.linkSelector) {
528 await enqueueLinks({
529 ...enqueueOptions,
530 selector: this.input.linkSelector,
531 });
532 }
533
534 if (this.input.clickableElementsSelector) {
535 await enqueueLinksByClickingElements({
536 ...enqueueOptions,
537 selector: this.input.clickableElementsSelector,
538 } as EnqueueLinksByClickingElementsOptions);
539 }
540 }
541
542 private async _handleResult(
543 request: Request,
544 response?: HTTPResponse,
545 pageFunctionResult?: Dictionary,
546 isError?: boolean,
547 ) {
548 const payload = tools.createDatasetPayload(
549 request,
550 response,
551 pageFunctionResult,
552 isError,
553 );
554 await this.dataset.pushData(payload);
555 this.pagesOutputted++;
556 }
557}