import { readFile } from 'node:fs/promises';
import type { IncomingMessage } from 'node:http';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';

import type {
    AutoscaledPool,
    Awaitable,
    CheerioCrawlerOptions,
    CheerioCrawlingContext,
    Dictionary,
    ProxyConfiguration,
    Request,
} from '@crawlee/cheerio';
import {
    CheerioCrawler,
    Dataset,
    KeyValueStore,
    log,
    RequestList,
    RequestQueueV2,
} from '@crawlee/cheerio';
import type { ApifyEnv } from 'apify';
import { Actor } from 'apify';
import { load } from 'cheerio';

import type {
    CrawlerSetupOptions,
    RequestMetadata,
} from '@apify/scraper-tools';
import {
    constants as scraperToolsConstants,
    createContext,
    tools,
} from '@apify/scraper-tools';

import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const { SESSION_MAX_USAGE_COUNTS, META_KEY } = scraperToolsConstants;
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), 'utf8'),
);
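// Note: the top-level `await` above requires this file to be an ES module.
// INPUT_SCHEMA.json is resolved relative to this module via `import.meta.url`,
// so the schema loads correctly regardless of the process working directory.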

// Cheerio does a lot of synchronous work, so the event loop is allowed to be
// busier than the default before the autoscaled pool considers it overloaded.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
const SESSION_STORE_NAME = 'APIFY-CHEERIO-SCRAPER-SESSION-STORE';
// Key-value store record that marks the request queue as already seeded with
// the start URLs, so they are not re-enqueued after a restart or migration.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

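/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a `pageFunction` invocation.
 *
 * A minimal usage sketch (hypothetical `main.ts`, assuming the actor input
 * matches the `Input` type):
 *
 * @example
 * await Actor.init();
 * const input = (await KeyValueStore.getInput()) as Input;
 * const setup = new CrawlerSetup(input);
 * const crawler = await setup.createCrawler();
 * await crawler.run();
 * await Actor.exit();
 */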
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Cheerio Scraper';
    rawInput: string;
    env: ApifyEnv;
    /**
     * Used to store data that persists across navigations.
     */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: CheerioCrawler;
    dataset!: Dataset;
    pagesOutputted!: number;
    proxyConfiguration?: ProxyConfiguration;
    private initPromise: Promise<void>;

    constructor(input: Input) {
        // Set the log level early so no debug messages are missed.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep a string copy of the raw input to keep it immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load the page function from disk if it's not present on input.
        tools.maybeLoadPageFunctionFromDisk(
            input,
            dirname(fileURLToPath(import.meta.url)),
        );

        // Validate the input against the schema when not running on the Apify platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error(
                    'The pseudoUrls Array must only contain Objects.',
                );
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error(
                    'The userData property of a pseudoUrl must be an Object.',
                );
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error(
                    'The initialCookies Array must only contain Objects.',
                );
            }
        });

        // Derive the session pool configuration from the proxy rotation setting.
        this.maxSessionUsageCount =
            SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Functions need to be evaluated from their string representations.
        this.evaledPageFunction = tools.evalFunctionOrThrow(
            this.input.pageFunction,
        );

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

        // Named storages
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Initialize async operations.
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.proxyConfiguration = null!;
        this.initPromise = this._initializeAsync();
    }
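    /**
     * Opens the named (or default) storages, seeds the request queue with the
     * start URLs on the first run only (guarded by REQUEST_QUEUE_INIT_FLAG_KEY),
     * reads the current dataset item count and creates the proxy configuration.
     * Awaited via `initPromise` at the start of `createCrawler()`.
     */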
    private async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        // KeyValueStore
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

        // RequestQueue
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        if (
            !(await this.keyValueStore.recordExists(
                REQUEST_QUEUE_INIT_FLAG_KEY,
            ))
        ) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(
                null,
                startUrls,
            )) {
                // Enqueueing far more than the result limit is wasted work,
                // since the crawl aborts once the limit is reached.
                if (
                    this.input.maxResultsPerCrawl > 0 &&
                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
                ) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } =
                await this.requestQueue.addRequestsBatched(requests);

            // Set the initialization flag only after all requests have
            // actually been added, without blocking the crawl start.
            void waitForAllRequestsToBeAdded.then(async () => {
                await this.keyValueStore.setValue(
                    REQUEST_QUEUE_INIT_FLAG_KEY,
                    '1',
                );
            });
        }

        // Dataset
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        // Proxy configuration
        this.proxyConfiguration = (await Actor.createProxyConfiguration(
            this.input.proxyConfiguration,
        )) as any as ProxyConfiguration;
    }
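    /**
     * Resolves to a `CheerioCrawler` instance fully configured from the actor
     * input. Waits for the asynchronous initialization kicked off in the
     * constructor before building the options.
     */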
    async createCrawler() {
        await this.initPromise;

        const options: CheerioCrawlerOptions = {
            proxyConfiguration: this.proxyConfiguration,
            requestHandler: this._requestHandler.bind(this),
            preNavigationHooks: [],
            postNavigationHooks: [],
            requestQueue: this.requestQueue,
            navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
            requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
            ignoreSslErrors: this.input.ignoreSslErrors,
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            additionalMimeTypes: this.input.additionalMimeTypes,
            autoscaledPoolOptions: {
                maxConcurrency: this.input.maxConcurrency,
                systemStatusOptions: {
                    // Cheerio does a lot of sync operations, so a higher
                    // event loop overload ratio than the default is allowed.
                    maxEventLoopOverloadedRatio:
                        MAX_EVENT_LOOP_OVERLOADED_RATIO,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                persistStateKeyValueStoreId: this.input.sessionPoolName
                    ? SESSION_STORE_NAME
                    : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // A single session means the same proxy is reused until it fails.
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        // `forceResponseEncoding` promotes the suggested encoding from a
        // fallback to an override of the declared response encoding.
        if (this.input.suggestResponseEncoding) {
            if (this.input.forceResponseEncoding) {
                options.forceResponseEncoding =
                    this.input.suggestResponseEncoding;
            } else {
                options.suggestResponseEncoding =
                    this.input.suggestResponseEncoding;
            }
        }

        this.crawler = new CheerioCrawler(options);

        return this.crawler;
    }
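    /**
     * Registers the built-in pre-navigation hook (header normalization and
     * initial cookies) and then appends the user-supplied pre- and
     * post-navigation hooks, each wrapped with the enhanced context.
     */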
    private _createNavigationHooks(options: CheerioCrawlerOptions) {
        options.preNavigationHooks!.push(async ({ request, session }) => {
            // Normalize all request headers to lowercase keys.
            request.headers = Object.entries(request.headers ?? {}).reduce(
                (newHeaders, [key, value]) => {
                    newHeaders[key.toLowerCase()] = value;
                    return newHeaders;
                },
                {} as Dictionary<string>,
            );

            // Add initial cookies, if any.
            if (this.input.initialCookies && this.input.initialCookies.length) {
                const cookiesToSet = session
                    ? tools.getMissingCookiesFromSession(
                          session,
                          this.input.initialCookies,
                          request.url,
                      )
                    : this.input.initialCookies;
                if (cookiesToSet?.length) {
                    // Only set cookies that are not already in the session.
                    session?.setCookies(cookiesToSet, request.url);
                }
            }
        });

        options.preNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks),
        );
        options.postNavigationHooks!.push(
            ...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks),
        );
    }
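    /**
     * Wraps each user-supplied hook so that its context is extended with the
     * `Actor` class (also exposed as `Apify` for backwards compatibility) and
     * the `customData` from the input.
     *
     * A user hook from the input could therefore look like this sketch (the
     * hook source arrives as a string on the actor input; the header name is
     * hypothetical):
     *
     * @example
     * async ({ request, customData }) => {
     *     request.headers['x-custom'] = String(customData); // hypothetical
     * }
     */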
    private _runHookWithEnhancedContext(
        hooks: ((...args: unknown[]) => Awaitable<void>)[],
    ) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }
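    /**
     * Called when a request exhausts all retries. Logs the last error message
     * and pushes an error record to the dataset via `_handleResult()`.
     */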
    private async _failedRequestHandler({ request }: CheerioCrawlingContext) {
        const lastError =
            request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }
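    /**
     * First of all, it initializes the state and features of the context by
     * copying the crawling context's properties into the `pageFunction`'s
     * arguments. Then it ensures the request carries internal metadata, aborts
     * the crawl if the result limit was reached, invokes the user-provided
     * `pageFunction` with the created context, enqueues links (unless the user
     * called `skipLinks()` or there is no HTML body) and finally saves the
     * `pageFunction`'s return value to the dataset.
     */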
    private async _requestHandler(crawlingContext: CheerioCrawlingContext) {
        const { request, response, $, crawler } = crawlingContext;
        const pageFunctionArguments: Dictionary = {};

        // Copy via property descriptors so getters / setters are not triggered.
        const props = Object.getOwnPropertyDescriptors(crawlingContext);
        ['json', 'body'].forEach((key) => {
            props[key].configurable = true;
        });
        Object.defineProperties(pageFunctionArguments, props);

        pageFunctionArguments.cheerio = load([]);
        pageFunctionArguments.response = {
            status: response!.statusCode,
            headers: response!.headers,
        };

        Object.defineProperties(
            this,
            Object.getOwnPropertyDescriptors(pageFunctionArguments),
        );

        // Make sure an object containing internal metadata (depth, parent
        // request ID) is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawl if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(
            crawler.autoscaledPool,
        );
        if (aborted) return;

        // Set up and create the context passed to the pageFunction.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        /**
         * USER FUNCTION INVOCATION
         */
        const pageFunctionResult = await this.evaledPageFunction(context);

        /**
         * POST-PROCESSING
         */

        // Enqueue more links if a link selector is available,
        // unless the user invoked the `skipLinks()` context function
        // or no Cheerio handle is available (non-HTML response).
        if (!state.skipLinks && !!$) await this._handleLinks(crawlingContext);

        // Save the `pageFunction`'s result to the default dataset.
        await this._handleResult(
            request,
            response,
            pageFunctionResult as Dictionary,
        );
    }
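    /**
     * Aborts the autoscaled pool once the number of dataset items reaches
     * `maxResultsPerCrawl`. Returns `true` when the crawl was aborted.
     */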
    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (
            !this.input.maxResultsPerCrawl ||
            this.pagesOutputted < this.input.maxResultsPerCrawl
        )
            return false;
        if (!autoscaledPool) return false;
        log.info(
            `User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`,
        );
        await autoscaledPool.abort();
        return true;
    }
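    /**
     * Enqueues links matched by the link selector, filtered by pseudo-URLs,
     * globs and excludes, unless the maximum crawling depth was reached.
     * Each enqueued request carries a depth counter under META_KEY, e.g.
     * `{ parentRequestId: 'abc123', depth: 2 }` (illustrative values).
     */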
    private async _handleLinks({
        request,
        enqueueLinks,
    }: CheerioCrawlingContext) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata)
            .depth;
        const hasReachedMaxDepth =
            this.input.maxCrawlingDepth &&
            currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(
                `Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`,
            );
            return;
        }

        await enqueueLinks({
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            globs: this.input.globs,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        });
    }
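    /**
     * Builds the dataset payload for a finished (or permanently failed)
     * request and pushes it to the dataset, incrementing the output counter
     * used by the result limit check.
     */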
    private async _handleResult(
        request: Request,
        response?: IncomingMessage,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(
            request,
            response,
            pageFunctionResult,
            isError,
        );
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}