1import { readFile } from 'node:fs/promises';
2import type { IncomingMessage } from 'node:http';
3import { dirname } from 'node:path';
4import { fileURLToPath, URL } from 'node:url';
5
6import type {
7    AutoscaledPool,
8    Awaitable,
9    CheerioCrawlerOptions,
10    CheerioCrawlingContext,
11    Dictionary,
12    ProxyConfiguration,
13    Request,
14} from '@crawlee/cheerio';
15import {
16    CheerioCrawler,
17    Dataset,
18    KeyValueStore,
19    log,
20    RequestList,
21    RequestQueueV2,
22} from '@crawlee/cheerio';
23import type { ApifyEnv } from 'apify';
24import { Actor } from 'apify';
25import { load } from 'cheerio';
26
27import type {
28    CrawlerSetupOptions,
29    RequestMetadata,
30} from '@apify/scraper-tools';
31import {
32    constants as scraperToolsConstants,
33    createContext,
34    tools,
35} from '@apify/scraper-tools';
36
37import type { Input } from './consts.js';
38import { ProxyRotation } from './consts.js';
39
// Constants provided by @apify/scraper-tools.
const SESSION_MAX_USAGE_COUNTS = scraperToolsConstants.SESSION_MAX_USAGE_COUNTS;
const META_KEY = scraperToolsConstants.META_KEY;

// The input schema is read and parsed once, while the module is being loaded.
// The path is resolved relative to this file, not to the working directory.
const schemaUrl = new URL('../../INPUT_SCHEMA.json', import.meta.url);
const SCHEMA = JSON.parse(await readFile(schemaUrl, 'utf8'));

// Passed to the autoscaled pool as `maxEventLoopOverloadedRatio`.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
// Key-value store used for session pool state when `sessionPoolName` is set.
const SESSION_STORE_NAME = 'APIFY-CHEERIO-SCRAPER-SESSION-STORE';
// Key-value store record marking that the start URLs were already enqueued.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
48
49
50
51
52
/**
 * Holds everything needed to build a {@link CheerioCrawler} from the Actor
 * input and to run the user-supplied page function for every crawled page.
 *
 * The constructor validates and pre-processes the input synchronously and
 * kicks off the asynchronous initialization of storages; `createCrawler()`
 * awaits that initialization before constructing the crawler.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Cheerio Scraper';
    /** JSON-serialized input, captured at the very start of the constructor. */
    rawInput: string;
    env: ApifyEnv;

    /** In-memory map handed to the page function context via `createContext`. */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: CheerioCrawler;
    dataset!: Dataset;
    /** Number of items pushed to the dataset, seeded from the dataset's existing item count. */
    pagesOutputted!: number;
    proxyConfiguration?: ProxyConfiguration;
    /** Settles once `_initializeAsync()` has finished opening storages. */
    private initPromise: Promise<void>;

    /**
     * @param input Actor input. It is validated against the input schema only
     *   when not running on the Apify platform (`Actor.isAtHome()` is false).
     * @throws Error when `pseudoUrls` or `initialCookies` contain non-object
     *   items, or when a pseudo-URL's `userData` is not a plain object.
     *   The `tools.*OrThrow` helpers may throw as well.
     */
    constructor(input: Input) {
        // Set log level early so that debug messages from setup are visible.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep a serialized copy of the input as it was received.
        this.rawInput = JSON.stringify(input);

        // NOTE(review): judging by its name, this helper may replace the page
        // function with one loaded from disk — confirm in @apify/scraper-tools.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Schema validation is skipped when running on the Apify platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations that go beyond the schema check above.
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        // The session usage limit depends on the chosen proxy rotation mode.
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Turn the user-provided function sources into callable functions.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

        // Names of the storages to open; `undefined` is passed to `open()` as is.
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Placeholders; the real values are assigned in `_initializeAsync()`
        // and `createCrawler()`.
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.proxyConfiguration = null!;
        this.initPromise = this._initializeAsync();
    }

    /**
     * Opens the key-value store, request queue and dataset, enqueues the
     * start URLs (unless the initialization flag is already stored) and
     * creates the proxy configuration.
     */
    private async _initializeAsync() {
        // Note: the start URL objects from the input are modified in place.
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        // Start URLs are enqueued only when the flag record does not exist yet.
        if (
            !(await this.keyValueStore.recordExists(
                REQUEST_QUEUE_INIT_FLAG_KEY,
            ))
        ) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(
                null,
                startUrls,
            )) {
                // With a results limit set, stop reading start URLs once
                // 1.5x that limit has been collected.
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);

            // Fire-and-forget: store the flag once every batch has been added.
            // A rejection is handled here so it cannot surface as an unhandled
            // promise rejection; without the flag, a later initialization
            // simply goes through the start URLs again.
            void waitForAllRequestsToBeAdded
                .then(async () => {
                    await this.keyValueStore.setValue(
                        REQUEST_QUEUE_INIT_FLAG_KEY,
                        '1',
                    );
                })
                .catch((error: unknown) => {
                    const reason =
                        error instanceof Error ? error.message : String(error);
                    log.warning(
                        `Failed to finish enqueuing start URLs or to store the ${REQUEST_QUEUE_INIT_FLAG_KEY} flag: ${reason}`,
                    );
                });
        }

        // Continue counting results from whatever the dataset already holds.
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        // The cast goes through `unknown` rather than `any`, so no `any` is
        // introduced while converting to the ProxyConfiguration type
        // imported from '@crawlee/cheerio'.
        this.proxyConfiguration = (await Actor.createProxyConfiguration(
            this.input.proxyConfiguration,
        )) as unknown as ProxyConfiguration;
    }

    /**
     * Waits for the asynchronous initialization to finish, then builds the
     * crawler from the input and stores it on `this.crawler`.
     *
     * @returns The newly created {@link CheerioCrawler}.
     */
    async createCrawler() {
        await this.initPromise;

        const options: CheerioCrawlerOptions = {
            proxyConfiguration: this.proxyConfiguration,
            requestHandler: this._requestHandler.bind(this),
            preNavigationHooks: [],
            postNavigationHooks: [],
            requestQueue: this.requestQueue,
            navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
            requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
            ignoreSslErrors: this.input.ignoreSslErrors,
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxRequestRetries: this.input.maxRequestRetries,
            // An input value of 0 means "no limit" and maps to `undefined`.
            maxRequestsPerCrawl:
                this.input.maxPagesPerCrawl === 0
                    ? undefined
                    : this.input.maxPagesPerCrawl,
            additionalMimeTypes: this.input.additionalMimeTypes,
            autoscaledPoolOptions: {
                maxConcurrency: this.input.maxConcurrency,
                systemStatusOptions: {
                    // NOTE(review): presumably set this high because Cheerio
                    // parsing is synchronous and keeps the event loop busy —
                    // confirm.
                    maxEventLoopOverloadedRatio:
                        MAX_EVENT_LOOP_OVERLOADED_RATIO,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                // The dedicated store is used only when a session pool name is given.
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // The "until failure" rotation mode limits the pool to one session.
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        // `suggestResponseEncoding` carries the encoding name;
        // `forceResponseEncoding` decides whether it is forced or only suggested.
        if (this.input.suggestResponseEncoding) {
            if (this.input.forceResponseEncoding) {
                options.forceResponseEncoding =
                    this.input.suggestResponseEncoding;
            } else {
                options.suggestResponseEncoding =
                    this.input.suggestResponseEncoding;
            }
        }

        this.crawler = new CheerioCrawler(options);

        return this.crawler;
    }

    /**
     * Fills `options.preNavigationHooks` and `options.postNavigationHooks`:
     * first a built-in hook that normalizes headers and sets initial cookies,
     * then the user-provided hooks wrapped with an enhanced context.
     *
     * @param options Crawler options; both hook arrays must already exist.
     */
    private _createNavigationHooks(options: CheerioCrawlerOptions) {
        options.preNavigationHooks!.push(async ({ request, session }) => {
            // Normalize all header names to lower case. `Object.fromEntries`
            // defines own properties, so no header name can touch the
            // prototype of the resulting object.
            request.headers = Object.fromEntries(
                Object.entries(request.headers ?? {}).map(
                    ([key, value]): [string, string] => [
                        key.toLowerCase(),
                        value,
                    ],
                ),
            );

            // Add initial cookies, if any were provided in the input.
            if (this.input.initialCookies && this.input.initialCookies.length) {
                const cookiesToSet = session
                    ? tools.getMissingCookiesFromSession(
                          session,
                          this.input.initialCookies,
                          request.url,
                      )
                    : this.input.initialCookies;
                if (cookiesToSet?.length) {
                    // Cookies are stored on the session; without a session
                    // this is a no-op.
                    session?.setCookies(cookiesToSet, request.url);
                }
            }
        });

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }

    /**
     * Wraps each hook so that it receives a copy of the crawling context
     * extended with `Apify`, `Actor` (both the `Actor` class) and the
     * `customData` from the input.
     *
     * @param hooks Evaluated user hooks.
     * @returns Wrapped hooks, in the same order.
     */
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    /**
     * Logs the first line of the request's last error message and stores an
     * error record for the request in the dataset.
     */
    private async _failedRequestHandler({ request }: CheerioCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * Handles a single successfully fetched page:
     *
     * 1. builds the arguments exposed to the page function,
     * 2. aborts the crawl if the results limit was already reached,
     * 3. invokes the user's page function with the created context,
     * 4. enqueues links (unless the page function asked to skip them or the
     *    page has no `$`), and
     * 5. pushes the page function result to the dataset.
     */
    private async _requestHandler(crawlingContext: CheerioCrawlingContext) {
        const { request, response, $, crawler } = crawlingContext;
        const pageFunctionArguments: Dictionary = {};

        // Copy property descriptors instead of values so that accessors on the
        // crawling context are transferred without being invoked.
        const props = Object.getOwnPropertyDescriptors(crawlingContext);
        // These descriptors get defined on `this` for every request (below);
        // redefining a property requires it to be configurable.
        ['json', 'body'].forEach((key) => {
            props[key].configurable = true;
        });
        Object.defineProperties(pageFunctionArguments, props);

        pageFunctionArguments.cheerio = load([]);
        // Optional chaining instead of a non-null assertion: a missing
        // response yields `undefined` fields rather than a TypeError.
        pageFunctionArguments.response = {
            status: response?.statusCode,
            headers: response?.headers,
        };

        // NOTE(review): this copies the per-request arguments onto the shared
        // CrawlerSetup instance, so concurrent requests overwrite each other's
        // values — confirm that nothing relies on them being stable.
        Object.defineProperties(
            this,
            Object.getOwnPropertyDescriptors(pageFunctionArguments),
        );

        /**
         * PRE-PROCESSING
         */
        // `_handleLinks` reads `request.userData[META_KEY].depth`; this call is
        // expected to make sure that metadata exists.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        // Set up the context and state for the page function.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        /**
         * USER FUNCTION INVOCATION
         */
        const pageFunctionResult = await this.evaledPageFunction(context);

        /**
         * POST-PROCESSING
         */
        // Enqueue more links unless the page function set `skipLinks` or the
        // context has no `$` to select links with.
        if (!state.skipLinks && !!$) await this._handleLinks(crawlingContext);

        // Save the page function's result to the dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }

    /**
     * Aborts the autoscaled pool once the number of outputted pages reaches
     * `maxResultsPerCrawl`.
     *
     * @param autoscaledPool Pool of the running crawler, if available.
     * @returns `true` when the pool was aborted; `false` when no limit is set,
     *   the limit was not reached, or no pool was provided.
     */
    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }

    /**
     * Enqueues links found on the page according to `linkSelector`,
     * `pseudoUrls`, `globs` and `excludes` from the input. Does nothing when
     * no link selector is set or the maximum crawling depth was reached.
     */
    private async _handleLinks({
        request,
        enqueueLinks,
    }: CheerioCrawlingContext) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        // A falsy `maxCrawlingDepth` (e.g. 0) disables the depth limit.
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        await enqueueLinks({
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            globs: this.input.globs,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                // Record the parent and depth for every enqueued request.
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                // Same unique-key settings as applied to the start URLs.
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        });
    }

    /**
     * Builds the dataset payload for a request, pushes it to the dataset and
     * increments the counter of outputted pages (for error records as well).
     *
     * @param request The processed request.
     * @param response HTTP response, when one is available.
     * @param pageFunctionResult Value returned by the page function.
     * @param isError `true` when the record describes a failed request.
     */
    private async _handleResult(
        request: Request,
        response?: IncomingMessage,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}