1import { readFile } from 'node:fs/promises';
2import type { IncomingMessage } from 'node:http';
3import { dirname } from 'node:path';
4import { fileURLToPath, URL } from 'node:url';
5
6import type {
7 AutoscaledPool,
8 Awaitable,
9 CheerioCrawlerOptions,
10 CheerioCrawlingContext,
11 Dictionary,
12 ProxyConfiguration,
13 Request,
14} from '@crawlee/cheerio';
15import { CheerioCrawler, Dataset, KeyValueStore, log, RequestList, RequestQueueV2 } from '@crawlee/cheerio';
16import type { ApifyEnv } from 'apify';
17import { Actor } from 'apify';
18import { load } from 'cheerio';
19
20import type { CrawlerSetupOptions, RequestMetadata } from '@apify/scraper-tools';
21import { constants as scraperToolsConstants, createContext, tools } from '@apify/scraper-tools';
22
23import type { Input } from './consts.js';
24import { ProxyRotation } from './consts.js';
25
// Constants taken from the shared scraper-tools package.
const SESSION_MAX_USAGE_COUNTS = scraperToolsConstants.SESSION_MAX_USAGE_COUNTS;
const META_KEY = scraperToolsConstants.META_KEY;

// Input schema, read once at module load from INPUT_SCHEMA.json (path resolved relative to this module).
const SCHEMA = JSON.parse(
    await readFile(new URL('../../INPUT_SCHEMA.json', import.meta.url), { encoding: 'utf8' }),
);

// Value passed to the autoscaled pool's `systemStatusOptions.maxEventLoopOverloadedRatio`.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
// Key-value store that holds persisted session pool state when a session pool name is set.
const SESSION_STORE_NAME = 'APIFY-CHEERIO-SCRAPER-SESSION-STORE';
// Key-value store record whose existence marks the start URLs as already enqueued.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
32
33
34
35
36
/**
 * Bundles everything needed to build a `CheerioCrawler` from the Actor input and to
 * create the context object handed to the user's page function for every request.
 *
 * The constructor validates the input and evaluates the user-supplied functions
 * synchronously, then starts the asynchronous storage / proxy initialization.
 * `createCrawler()` awaits that initialization before building the crawler.
 */
export class CrawlerSetup implements CrawlerSetupOptions {
    name = 'Cheerio Scraper';
    rawInput: string;
    env: ApifyEnv;

    /** Map instance passed to every page function context as `globalStore`. */
    globalStore = new Map();
    requestQueue: RequestQueueV2;
    keyValueStore: KeyValueStore;
    customData: unknown;
    input: Input;
    maxSessionUsageCount: number;
    evaledPageFunction: (...args: unknown[]) => unknown;
    evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable<void>)[];
    datasetName?: string;
    keyValueStoreName?: string;
    requestQueueName?: string;

    crawler!: CheerioCrawler;
    dataset!: Dataset;
    pagesOutputted!: number;
    proxyConfiguration?: ProxyConfiguration;
    /** Settles once `_initializeAsync()` has opened the storages and the proxy configuration. */
    private initPromise: Promise<void>;

    /**
     * @param input Actor input. It is validated against the input schema when not running
     * on the Apify platform, and its `pseudoUrls` / `initialCookies` entries are always checked.
     * @throws Error when `pseudoUrls` or `initialCookies` contain non-object entries, or when
     * a pseudo URL's `userData` is not a plain object. The `tools` helpers may throw as well
     * (schema validation, function evaluation).
     */
    constructor(input: Input) {
        // Set the log level first so that no debug messages are missed.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep a serialized snapshot of the input as it was received.
        this.rawInput = JSON.stringify(input);

        // Let the tools helper load the page function from disk (relative to this module's directory).
        tools.maybeLoadPageFunctionFromDisk(input, dirname(fileURLToPath(import.meta.url)));

        // Validate the input only when NOT running on the Apify platform.
        if (!Actor.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        this.input = input;
        this.env = Actor.getEnv();

        // Validations that are performed regardless of the schema check above.
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) {
                throw new Error('The pseudoUrls Array must only contain Objects.');
            }
            if (purl.userData && !tools.isPlainObject(purl.userData)) {
                throw new Error('The userData property of a pseudoUrl must be an Object.');
            }
        });

        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) {
                throw new Error('The initialCookies Array must only contain Objects.');
            }
        });

        // The session usage limit is looked up by the selected proxy rotation mode.
        this.maxSessionUsageCount = SESSION_MAX_USAGE_COUNTS[this.input.proxyRotation];

        // Evaluate the user-supplied functions up front so that invalid code fails early.
        this.evaledPageFunction = tools.evalFunctionOrThrow(this.input.pageFunction);

        if (this.input.preNavigationHooks) {
            this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.preNavigationHooks,
                'preNavigationHooks',
            );
        } else {
            this.evaledPreNavigationHooks = [];
        }

        if (this.input.postNavigationHooks) {
            this.evaledPostNavigationHooks = tools.evalFunctionArrayOrThrow(
                this.input.postNavigationHooks,
                'postNavigationHooks',
            );
        } else {
            this.evaledPostNavigationHooks = [];
        }

        // Names of the storages to open.
        this.datasetName = this.input.datasetName;
        this.keyValueStoreName = this.input.keyValueStoreName;
        this.requestQueueName = this.input.requestQueueName;

        // Placeholders; the real instances are assigned in _initializeAsync() / createCrawler().
        this.crawler = null!;
        this.requestQueue = null!;
        this.dataset = null!;
        this.keyValueStore = null!;
        this.proxyConfiguration = null!;
        this.initPromise = this._initializeAsync();
    }

    /**
     * Opens the key-value store, request queue and dataset, enqueues the start URLs
     * (once, guarded by a flag record in the key-value store), reads the number of
     * already stored dataset items and creates the proxy configuration.
     */
    private async _initializeAsync() {
        // NOTE: this mutates the start URL objects of the input in place.
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });

        // Key-value store
        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

        // Request queue
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

        // Enqueue the start URLs only when the flag record does not exist yet.
        if (!(await this.keyValueStore.recordExists(REQUEST_QUEUE_INIT_FLAG_KEY))) {
            const requests: Request[] = [];
            for await (const request of await RequestList.open(null, startUrls)) {
                // With a results limit set, stop collecting at 1.5x that limit.
                if (this.input.maxResultsPerCrawl > 0 && requests.length >= 1.5 * this.input.maxResultsPerCrawl) {
                    break;
                }
                requests.push(request);
            }

            const { waitForAllRequestsToBeAdded } = await this.requestQueue.addRequestsBatched(requests);

            // Deliberately not awaited so initialization is not blocked by the remaining batches.
            // The flag is written only after all requests were added. Failures are logged
            // instead of surfacing as an unhandled promise rejection.
            void waitForAllRequestsToBeAdded
                .then(async () => {
                    await this.keyValueStore.setValue(REQUEST_QUEUE_INIT_FLAG_KEY, '1');
                })
                .catch((error: unknown) => {
                    const reason = error instanceof Error ? error.message : String(error);
                    log.error(`Failed to finish adding the start URLs to the request queue: ${reason}`);
                });
        }

        // Dataset; the counter starts from the number of items already stored.
        this.dataset = await Dataset.open(this.datasetName);
        const info = await this.dataset.getInfo();
        this.pagesOutputted = info?.itemCount ?? 0;

        // Proxy configuration
        // NOTE(review): the double cast presumably bridges the `apify` and `@crawlee/cheerio`
        // ProxyConfiguration typings - confirm whether it is still required.
        this.proxyConfiguration = (await Actor.createProxyConfiguration(
            this.input.proxyConfiguration,
        )) as any as ProxyConfiguration;
    }

    /**
     * Resolves with a `CheerioCrawler` configured from the input.
     * Waits for the asynchronous initialization started in the constructor.
     *
     * @returns The crawler instance, which is also stored in `this.crawler`.
     */
    async createCrawler() {
        await this.initPromise;

        const options: CheerioCrawlerOptions = {
            proxyConfiguration: this.proxyConfiguration,
            requestHandler: this._requestHandler.bind(this),
            preNavigationHooks: [],
            postNavigationHooks: [],
            requestQueue: this.requestQueue,
            navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
            requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
            ignoreSslErrors: this.input.ignoreSslErrors,
            failedRequestHandler: this._failedRequestHandler.bind(this),
            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
            maxRequestRetries: this.input.maxRequestRetries,
            // An input value of 0 is passed on as "no limit" (undefined).
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl === 0 ? undefined : this.input.maxPagesPerCrawl,
            additionalMimeTypes: this.input.additionalMimeTypes,
            autoscaledPoolOptions: {
                maxConcurrency: this.input.maxConcurrency,
                systemStatusOptions: {
                    maxEventLoopOverloadedRatio: MAX_EVENT_LOOP_OVERLOADED_RATIO,
                },
            },
            useSessionPool: true,
            persistCookiesPerSession: true,
            sessionPoolOptions: {
                // Session state is persisted to the dedicated store only when a pool name is given.
                persistStateKeyValueStoreId: this.input.sessionPoolName ? SESSION_STORE_NAME : undefined,
                persistStateKey: this.input.sessionPoolName,
                sessionOptions: {
                    maxUsageCount: this.maxSessionUsageCount,
                },
            },
            experiments: {
                requestLocking: true,
            },
        };

        this._createNavigationHooks(options);

        // "Until failure" rotation uses a session pool of size one.
        if (this.input.proxyRotation === ProxyRotation.UntilFailure) {
            options.sessionPoolOptions!.maxPoolSize = 1;
        }

        // The same input encoding is either forced or merely suggested.
        if (this.input.suggestResponseEncoding) {
            if (this.input.forceResponseEncoding) {
                options.forceResponseEncoding = this.input.suggestResponseEncoding;
            } else {
                options.suggestResponseEncoding = this.input.suggestResponseEncoding;
            }
        }

        this.crawler = new CheerioCrawler(options);

        return this.crawler;
    }

    /**
     * Appends the built-in pre-navigation hook (header normalization and initial cookies)
     * followed by the user-supplied pre- and post-navigation hooks to `options`.
     *
     * @param options Crawler options whose hook arrays are extended in place.
     */
    private _createNavigationHooks(options: CheerioCrawlerOptions) {
        options.preNavigationHooks!.push(async ({ request, session }) => {
            // Normalize header names to lowercase.
            request.headers = Object.entries(request.headers ?? {}).reduce((newHeaders, [key, value]) => {
                newHeaders[key.toLowerCase()] = value;
                return newHeaders;
            }, {} as Dictionary<string>);

            // Add initial cookies, if any.
            if (this.input.initialCookies && this.input.initialCookies.length) {
                const cookiesToSet = session
                    ? tools.getMissingCookiesFromSession(session, this.input.initialCookies, request.url)
                    : this.input.initialCookies;
                if (cookiesToSet?.length) {
                    // Cookies are stored on the session; without a session nothing is set.
                    session?.setCookies(cookiesToSet, request.url);
                }
            }
        });

        options.preNavigationHooks!.push(...this._runHookWithEnhancedContext(this.evaledPreNavigationHooks));
        options.postNavigationHooks!.push(...this._runHookWithEnhancedContext(this.evaledPostNavigationHooks));
    }

    /**
     * Wraps each hook so that it receives a copy of the crawling context extended with
     * `Apify`, `Actor` and the input's `customData`.
     *
     * @param hooks Evaluated user hooks.
     * @returns Wrapped hooks in the same order.
     */
    private _runHookWithEnhancedContext(hooks: ((...args: unknown[]) => Awaitable<void>)[]) {
        return hooks.map((hook) => (ctx: Dictionary, ...args: unknown[]) => {
            const { customData } = this.input;
            return hook({ ...ctx, Apify: Actor, Actor, customData }, ...args);
        });
    }

    /**
     * Logs the first line of the request's last error message and stores an error
     * result for the request in the dataset.
     */
    private async _failedRequestHandler({ request }: CheerioCrawlingContext) {
        const lastError = request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(
            `Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`,
        );
        return this._handleResult(request, undefined, undefined, true);
    }

    /**
     * Request handler passed to the crawler. For each request it:
     * - builds the page function arguments from the crawling context,
     * - ensures request metadata and checks the results limit,
     * - invokes the user's page function,
     * - enqueues links (unless skipped by the page function or no `$` is available),
     * - stores the page function result in the dataset.
     */
    private async _requestHandler(crawlingContext: CheerioCrawlingContext) {
        const { request, response, $, crawler } = crawlingContext;
        const pageFunctionArguments: Dictionary = {};

        // Copy property descriptors instead of values so that getters on the context are not triggered.
        const props = Object.getOwnPropertyDescriptors(crawlingContext);
        for (const key of ['json', 'body']) {
            // Guarded: a context without an own `json` / `body` property must not crash the handler.
            const descriptor = props[key];
            if (descriptor) descriptor.configurable = true;
        }
        Object.defineProperties(pageFunctionArguments, props);

        pageFunctionArguments.cheerio = load([]);
        // Expose only the status and headers; tolerate a missing response instead of throwing.
        pageFunctionArguments.response = {
            status: response?.statusCode,
            headers: response?.headers,
        };

        // NOTE(review): this copies the per-request arguments onto the shared setup instance;
        // kept as in the original - confirm it is still needed.
        Object.defineProperties(this, Object.getOwnPropertyDescriptors(pageFunctionArguments));

        /**
         * PRE-PROCESSING
         */
        // Make sure the internal metadata object is present on the request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(crawler.autoscaledPool);
        if (aborted) return;

        // Set up the context and pass the configuration options.
        const contextOptions = {
            crawlerSetup: {
                rawInput: this.rawInput,
                env: this.env,
                globalStore: this.globalStore,
                requestQueue: this.requestQueue,
                keyValueStore: this.keyValueStore,
                customData: this.input.customData,
            },
            pageFunctionArguments,
        };
        const { context, state } = createContext(contextOptions);

        /**
         * USER FUNCTION INVOCATION
         */
        const pageFunctionResult = await this.evaledPageFunction(context);

        /**
         * POST-PROCESSING
         */
        // Enqueue more links if the page function did not skip them and a parsed document exists.
        if (!state.skipLinks && !!$) await this._handleLinks(crawlingContext);

        // Save the page function's result to the dataset.
        await this._handleResult(request, response, pageFunctionResult as Dictionary);
    }

    /**
     * Aborts the autoscaled pool once the number of stored results has reached
     * `input.maxResultsPerCrawl`.
     *
     * @param autoscaledPool Pool of the running crawler; when missing, nothing is aborted.
     * @returns `true` when the pool was aborted, `false` otherwise.
     */
    private async _handleMaxResultsPerCrawl(autoscaledPool?: AutoscaledPool) {
        if (!this.input.maxResultsPerCrawl || this.pagesOutputted < this.input.maxResultsPerCrawl) return false;
        if (!autoscaledPool) return false;
        log.info(`User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`);
        await autoscaledPool.abort();
        return true;
    }

    /**
     * Enqueues links found by `input.linkSelector`, filtered by the configured pseudo URLs,
     * globs and excludes, unless the request already sits at the maximum crawling depth.
     * Every enqueued request gets metadata with its parent request and depth.
     */
    private async _handleLinks({ request, enqueueLinks }: CheerioCrawlingContext) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const currentDepth = (request.userData![META_KEY] as RequestMetadata).depth;
        // A falsy maxCrawlingDepth (e.g. 0) disables the depth check.
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        await enqueueLinks({
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            globs: this.input.globs,
            exclude: this.input.excludes,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData ??= {};
                requestOptions.userData[META_KEY] = {
                    parentRequestId: request.id || request.uniqueKey,
                    depth: currentDepth + 1,
                };

                // Same unique-key / fragment settings as applied to the start URLs.
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        });
    }

    /**
     * Builds the dataset payload for a request, pushes it to the dataset and
     * increments the counter of stored results.
     *
     * @param request Request the result belongs to.
     * @param response HTTP response, if one is available.
     * @param pageFunctionResult Value returned by the page function, if any.
     * @param isError Set for results stored by the failed-request handler.
     */
    private async _handleResult(
        request: Request,
        response?: IncomingMessage,
        pageFunctionResult?: Dictionary,
        isError?: boolean,
    ) {
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await this.dataset.pushData(payload);
        this.pagesOutputted++;
    }
}