1import { Actor } from 'apify';
2import { CheerioCrawler } from 'crawlee';
3
4import { Dataset, createCheerioRouter } from 'crawlee';
5
6
/**
 * Turns the source text of a check function into a callable function.
 *
 * @param {string} fnStr - JavaScript source that must evaluate to a function
 *     (for example an arrow function expression).
 * @returns {Function} The function produced by evaluating `fnStr`.
 * @throws {Error} 'Cannot parse check function' when evaluation throws or the
 *     result is not a function. The underlying error is attached as `cause`.
 */
const parseCheckFunction = (fnStr) => {
    try {
        // SECURITY NOTE(review): eval runs arbitrary code with this process's
        // privileges. Here the source comes from the Actor input; confirm that
        // everyone who can supply that input is trusted to run code.
        const checkFunction = eval(fnStr);
        if (typeof checkFunction !== 'function') {
            throw new Error('Check function is not a function');
        }
        return checkFunction;
    } catch (err) {
        console.error(err);
        // Keep the original failure reachable for callers instead of
        // discarding it behind the generic message.
        throw new Error('Cannot parse check function', { cause: err });
    }
};
19
20
// The Actor SDK must be initialized before input, storages or events are used.
await Actor.init();

// Actor.getInput() resolves to null when the run has no input. Fall back to an
// empty object so the destructuring cannot throw an opaque TypeError; a missing
// check function is then reported by parseCheckFunction instead.
const {
    startUrls,
    checkFunction: checkFunctionString,
    proxyConfiguration: proxyConfigurationOptions,
} = (await Actor.getInput()) ?? {};

const checkFunction = parseCheckFunction(checkFunctionString);

const proxyConfiguration = await Actor.createProxyConfiguration(proxyConfigurationOptions);

// Restore the failed-check counter stored under 'my-crawling-state', if any.
// Default to 0 when there is no stored state or it lacks the field, so the
// later increment never starts from undefined (which would produce NaN).
const savedCrawlingState = await Actor.getValue('my-crawling-state');
let failedChecksCount = savedCrawlingState?.failedChecksCount ?? 0;

// Save the counter when the platform signals a migration. The write is awaited
// so the listener's promise settles only after setValue finishes rather than
// being left floating.
// NOTE(review): presumably the SDK's event manager waits for async listeners;
// confirm against the Apify SDK docs.
Actor.on('migrating', async () => {
    await Actor.setValue('my-crawling-state', { failedChecksCount });
});
41
// Router with a single default route that applies the user-supplied check.
const router = createCheerioRouter();

/**
 * Default route: forwards the handler arguments unchanged to checkFunction,
 * counts a falsy result as a failed check, and stores the loaded URL together
 * with the raw check result in the dataset.
 */
router.addDefaultHandler(async (context, ...extraArgs) => {
    const isOk = await checkFunction(context, ...extraArgs);

    if (!isOk) {
        failedChecksCount++;
    }

    const record = {
        url: context.request.loadedUrl,
        isOk,
    };
    await Dataset.pushData(record);
});
52
// Build the crawler around the router and the configured proxy, then crawl
// the start URLs taken from the input.
const crawlerOptions = {
    proxyConfiguration,
    requestHandler: router,
};
const crawler = new CheerioCrawler(crawlerOptions);

await crawler.run(startUrls);

// Finish the run: fail the Actor when at least one check was counted as
// failed, otherwise exit normally.
const hasFailedChecks = failedChecksCount > 0;
if (!hasFailedChecks) {
    await Actor.exit();
} else {
    await Actor.fail('Check failed on some of the pages');
}