1import { Actor, log } from 'apify';
2
3import { getValidator } from './validator.js';
4import { maybeTranslateDatasetId } from './dataset_utils.js';
5
/**
 * Shape of the Actor input consumed by this script.
 *
 * - `datasetIds`: datasets to validate; each entry is later passed through
 *   `maybeTranslateDatasetId` before the dataset is opened.
 * - `schema`: schema object handed to `getValidator` to build the item validator.
 */
type ValidatorInput = {
    datasetIds: string[];
    schema: Record<string, unknown>;
};

await Actor.init();

const input = await Actor.getInput<ValidatorInput>();

// Both fields are mandatory; bail out early with a clear message when either is missing.
if (input === null || !input.datasetIds || !input.schema) {
    throw new Error('Both datasetIds and schema are required.');
}

// Validator built once from the input schema and reused for every item of every dataset.
const validator = await getValidator(input.schema);

// `Actor.apifyClient` is a plain accessor returning the client instance, not a promise,
// so it is read directly instead of being awaited.
const client = Actor.apifyClient;

// Default dataset of this run; it receives one record per invalid item found.
const defaultDataset = await Actor.openDataset();
22
/** Number of invalid-item records buffered in memory before they are flushed to the default dataset. */
const INVALID_ITEMS_BATCH_SIZE = 1000;

/** Item count above which a "validation may take a long time" warning is logged. */
const LARGE_DATASET_ITEM_COUNT = 100000;

/** Record written to the run's default dataset for every item that fails validation. */
type InvalidItemRecord = {
    datasetId: string;
    itemPosition: number;
    validationErrors: unknown;
};

/**
 * Best-effort push of one batch of invalid-item records to the default dataset.
 *
 * A failed push is logged and reported through the return value instead of being
 * thrown, so a storage problem does not abort validation of the remaining items.
 *
 * @param batch Records to store.
 * @returns `true` when the batch was stored, `false` when the push failed.
 */
async function storeInvalidItems(batch: InvalidItemRecord[]): Promise<boolean> {
    try {
        await defaultDataset.pushData(batch);
        return true;
    } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        // NOTE(review): the `data.invalidItems[0].validationErrors` path is carried over from the
        // original logging code; presumably the platform attaches it to rejected pushes — confirm.
        const details = (error as { data?: { invalidItems?: Array<{ validationErrors?: unknown }> } } | null | undefined)
            ?.data?.invalidItems?.[0]?.validationErrors;
        log.error('push failed', { error: message, data: details });
        return false;
    }
}

// Validate every requested dataset independently: a failure in one dataset is logged
// and does not prevent the remaining datasets from being processed.
for (const datasetId of input.datasetIds) {
    try {
        log.info(`Checking existence of dataset with ID/Name: ${datasetId}`);

        const realDatasetId = await maybeTranslateDatasetId(datasetId, client);

        log.info(`Validating dataset with ID: ${realDatasetId}`);

        const dataset = await Actor.openDataset(realDatasetId, { forceCloud: true });

        const info = await dataset.getInfo();
        if (!info) throw new Error('Dataset not found!');

        log.info(`Dataset contains ${info.itemCount} items`);
        if (info.itemCount > LARGE_DATASET_ITEM_COUNT) {
            log.warning('Dataset is very large, validation may take a long time.');
        }

        let invalidItemsCount = 0;
        let unstoredItemsCount = 0;
        let pendingRecords: InvalidItemRecord[] = [];

        // Flushes the buffered records (if any) and tracks how many could not be stored.
        // The buffer is swapped out before the push so it is emptied whether or not the push succeeds.
        const flushPending = async (): Promise<void> => {
            if (pendingRecords.length === 0) return;
            const batch = pendingRecords;
            pendingRecords = [];
            const stored = await storeInvalidItems(batch);
            if (!stored) unstoredItemsCount += batch.length;
        };

        await dataset.forEach(async (item, index) => {
            const isValid = validator(item);
            if (isValid) return;

            invalidItemsCount++;

            pendingRecords.push({
                datasetId,
                itemPosition: index,
                validationErrors: validator.errors,
            });

            // Flush in batches to keep memory bounded on datasets with many invalid items.
            if (pendingRecords.length >= INVALID_ITEMS_BATCH_SIZE) {
                await flushPending();
            }
        });

        // Store whatever is left over after the last full batch.
        await flushPending();

        if (unstoredItemsCount === 0) {
            log.info(`Found ${invalidItemsCount} invalid items in the dataset with ID: ${datasetId}, errors are stored in default dataset of this run.`);
        } else {
            log.warning(`Found ${invalidItemsCount} invalid items in the dataset with ID: ${datasetId}, but ${unstoredItemsCount} of them could not be stored in default dataset of this run.`);
        }
    } catch (error: unknown) {
        // This handler covers lookup, opening and iteration failures alike, hence the generic message.
        const exception = error instanceof Error ? error : new Error(String(error));
        log.exception(exception, 'Failed to validate dataset', { datasetId });
    }
}

await Actor.exit();