import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    EnqueueLinksByClickingElementsOptions,
    EnqueueLinksOptions,
    ProxyConfiguration,
    PuppeteerCrawlerOptions,
    PuppeteerCrawlingContext,
    Request,
} from '@crawlee/puppeteer';
import {
    Dataset,
    KeyValueStore,
    log,
    PuppeteerCrawler,
    RequestList,
    RequestQueueV2,
} from '@crawlee/puppeteer';
import type { Awaitable, Dictionary } from '@crawlee/utils';
import { sleep } from '@crawlee/utils';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { getInjectableScript } from 'idcac-playwright';
import type { HTTPResponse } from 'puppeteer';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    browserTools,
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PUPPETEER-SCRAPER-SESSION-STORE';

const {
    META_KEY,
    DEFAULT_VIEWPORT,
    DEVTOOLS_TIMEOUT_SECS,
    SESSION_MAX_USAGE_COUNTS,
} = scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);

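/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */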
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Puppeteer Scraper';
    rawInput: string;
    env: ApifyEnv;

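    /**
     * Used to store data that persists across navigations within a single run.
     */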
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    blockedUrlPatterns: string[] = [];
    devtools: boolean;
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: PuppeteerCrawler;
    requestList!: RequestList;
    dataset!: Dataset;
    pagesOutputted!: number;
    private initPromise: Promise<void>;

    constructor(input: Input) {
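        // Set the log level early so subsequent messages respect it.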
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

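        // Keep the input as a string so it stays immutable.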
        this.rawInput = JSON.stringify(input);

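        // Attempt to load the page function from disk if it is not provided in the input.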
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

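        // Validate the input against the schema when not running on the Apify platform.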
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

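        // Validate the shape of user-provided options.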
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        this.input.waitUntil.forEach((event) => {
            if (
                !/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(
                    event,
                )
            ) {
                throw new Error(
                    'Navigation wait until events must be valid. See tooltip.',
                );
            }
        });

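        // Determine how many times a session may be reused, based on the proxy rotation setting.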
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

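        // The page function and navigation hooks arrive as strings and must be evaluated.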
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

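        // Block media and font resources unless the user opted in to downloading them.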
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [
                '.jpg',
                '.jpeg',
                '.png',
                '.svg',
                '.gif',
                '.webp',
                '.webm',
                '.ico',
                '.woff',
                '.eot',
            ];
        }

        if (!this.input.downloadCss) {
            this.blockedUrlPatterns.push('.css');
        }

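        // Start Chromium with the debugger attached whenever the page function contains a debugger statement.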
        this.devtools = this.input.pageFunction.includes('debugger;');

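        // Named storages to use instead of the default, unnamed ones.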
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

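        // Placeholders only; the real instances are created in _initializeAsync() and createCrawler().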
        this.crawler = null!;
        this.requestList = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.initPromise = this._initializeAsync();
    }

    private async _initializeAsync() {
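        // RequestList: normalize start URLs so unique keys and fragments are handled consistently.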
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.requestList = await RequestList.open(
            'PUPPETEER_SCRAPER',
            startUrls,
        );

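        // RequestQueue for dynamically enqueued requests.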
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

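        // Dataset: restore the output counter so maxResultsPerCrawl survives restarts.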
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

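        // KeyValueStore exposed to the page function.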
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
    }

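    /**
     * Resolves to a fully configured `PuppeteerCrawler` instance.
     */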
    async createCrawler() {
        await this.initPromise;

        const args = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

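        // Base crawler options; pre- and post-navigation hooks are attached below.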
        const options: PuppeteerCrawlerOptions = {
            requestHandler: this._requestHandler.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            requestHandlerTimeoutSecs: this.devtools
                ? DEVTOOLS_TIMEOUT_SECS
                : this.input.pageFunctionTimeoutSecs,
            preNavigationHooks: [],
            postNavigationHooks: [],
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyConfiguration: (await Actor.createProxyConfiguration(
                this.input.proxyConfiguration,
            )) as any as ProxyConfiguration,
            launchContext: {
                useChrome: this.input.useChrome,
                launchOptions: {
                    acceptInsecureCerts: this.input.ignoreSslErrors,
                    defaultViewport: DEFAULT_VIEWPORT,
                    devtools: this.devtools,
                    args,
                    headless: this.input.headless,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

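        // A session pool of size 1 means the proxy session is rotated only after it fails.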
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        this.crawler = new PuppeteerCrawler(options);

        return this.crawler;
    }

    private _createNavigationHooks(options: PuppeteerCrawlerOptions) {
        options.preNavigationHooks!.push(
            async ({ request, page, session, blockRequests }, gotoOptions) => {
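                // Forward browser console messages to the log, if requested.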
                if (this.input.browserLog) browserTools.dumpConsole(page);

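                // Block the configured resource URL patterns before navigation.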
                if (this.blockedUrlPatterns.length) {
                    await blockRequests({
                        urlPatterns: this.blockedUrlPatterns,
                    });
                }

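                // Add initial cookies, if any, skipping those the session already holds.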
                if (
                    this.input.initialCookies &&
                    this.input.initialCookies.length
                ) {
                    const cookiesToSet = session
                        ? tools.getMissingCookiesFromSession(
                              session,
                              this.input.initialCookies,
                              request.url,
                          )
                        : this.input.initialCookies;

                    if (cookiesToSet?.length) {
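                        // Store the cookies on the session as well, so retries do not set them again.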
                        session?.setCookies(cookiesToSet, request.url);
                        await page.setCookie(...cookiesToSet);
                    }
                }

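                // Bypass CSP so that custom scripts can be injected when CORS/CSP are ignored.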
                if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);

                if (gotoOptions) {
                    gotoOptions.timeout =
                        (this.devtools
                            ? DEVTOOLS_TIMEOUT_SECS
                            : this.input.pageLoadTimeoutSecs) * 1000;
                    gotoOptions.waitUntil = this.input.waitUntil;
                }
            },
        );

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    private async _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

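    /**
     * First ensures request metadata and the result limit are handled, then
     * builds the context for the user-provided pageFunction, invokes it, and
     * finally enqueues discovered links and saves the result to the dataset.
     */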
    private async _requestHandler(crawlingContext: PuppeteerCrawlingContext) {
        const { request, response, crawler } = crawlingContext;

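        // Make sure depth-tracking metadata is present on the request.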
        tools.ensureMetaData(request);

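        // Abort the crawl once the maximum number of results has been reached.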
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        const pageFunctionArguments: Dictionary = {};

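        // Copy the whole crawling context onto the arguments object, preserving getters.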
        Object.defineProperties(
            pageFunctionArguments,
            Object.getOwnPropertyDescriptors(crawlingContext),
        );

        pageFunctionArguments.response = {
            status: response && response.status(),
            headers: response && response.headers(),
        };

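        // Assemble the context that the user-provided pageFunction receives.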
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

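        // Attempt to close cookie consent modals by injecting the "I don't care about cookies" script.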
        if (this.input.closeCookieModals) {
            await sleep(500);
            await crawlingContext.page.evaluate(getInjectableScript());
            await sleep(2000);
        }

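        // Scroll down to the configured height to trigger lazily loaded content.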
        if (this.input.maxScrollHeightPixels > 0) {
            await crawlingContext.infiniteScroll({
                maxScrollHeight: this.input.maxScrollHeightPixels,
            });
        }

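        // Invoke the user-provided pageFunction with the prepared context.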
        const pageFunctionResult = await this.evaledPageFunction(context);

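        // Enqueue links unless the pageFunction called skipLinks().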
        if (!state.skipLinks) await this._handleLinks(crawlingContext);

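        // Save the pageFunction's return value to the dataset.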
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        ) {
            return false;
        }
        if (!autoscaledPool) return false;
        log.info(
            `User-set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    private async _handleLinks({
        request,
        enqueueLinks,
        enqueueLinksByClickingElements,
    }: PuppeteerCrawlingContext) {
        if (!this.requestQueue) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

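        // Shared enqueue options; the transform propagates depth metadata to child requests.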
        const enqueueOptions: EnqueueLinksOptions = {
            globs: this.input.globs,
            pseudoUrls: this.input.pseudoUrls,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        if (this.input.linkSelector) {
            await enqueueLinks({
                ...enqueueOptions,
                selector: this.input.linkSelector,
            });
        }

        if (this.input.clickableElementsSelector) {
            await enqueueLinksByClickingElements({
                ...enqueueOptions,
                selector: this.input.clickableElementsSelector,
            } as EnqueueLinksByClickingElementsOptions);
        }
    }

    private async _handleResult(
        request: Request,
        response?: HTTPResponse,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}