1import { Actor, log } from 'apify';
2
3import { getValidator } from './validator.js';
4import { maybeTranslateDatasetId } from './dataset_utils.js';
5
// Bootstrap the Actor runtime before touching input, storages or the API client.
await Actor.init();

// Actor input: the datasets to check (IDs or names) and the schema that is
// handed to getValidator() to build the item validator.
const input = await Actor.getInput<{
    datasetIds: string[];
    schema: Record<string, unknown>;
}>();

// Fail fast on missing or malformed input. The explicit array check keeps a
// stray string value from being iterated character by character by the
// dataset loop below.
if (input === null || !Array.isArray(input.datasetIds) || !input.schema) {
    throw new Error('Both datasetIds and schema are required.');
}

// Validation function built once from the input schema and reused for every item.
const validator = await getValidator(input.schema);

// `Actor.apifyClient` is a plain getter rather than a promise, so it is not awaited.
const client = Actor.apifyClient;

// Default dataset of this run; validation failures are written here.
const defaultDataset = await Actor.openDataset();
22
/** Number of buffered validation failures that triggers a push to the default dataset. */
const INVALID_ITEMS_BATCH_SIZE = 1000;

/** Item count above which a warning about long validation time is logged. */
const LARGE_DATASET_ITEM_COUNT = 100000;

// Validate each requested dataset in turn. A failure while processing one
// dataset is logged and does not prevent the remaining datasets from being checked.
for (const datasetId of input.datasetIds) {
    // Buffer of validation failures waiting to be pushed to the default dataset.
    let invalidItems: Array<{
        datasetId: string;
        itemPosition: number;
        validationErrors: unknown;
    }> = [];

    try {
        log.info(`Checking existence of dataset with ID/Name: ${datasetId}`);

        // The input may hold either an ID or a name; resolve it to the ID used for opening.
        const realDatasetId = await maybeTranslateDatasetId(datasetId, client);

        log.info(`Validating dataset with ID: ${realDatasetId}`);

        const dataset = await Actor.openDataset(realDatasetId, { forceCloud: true });

        const info = await dataset.getInfo();
        if (!info) throw new Error('Dataset not found!');

        log.info(`Dataset contains ${info.itemCount} items`);
        if (info.itemCount > LARGE_DATASET_ITEM_COUNT) {
            log.warning('Dataset is very large, validation may take a long time.');
        }

        // Total failures for this dataset; unlike the buffer it is never reset.
        let invalidItemsCount = 0;

        await dataset.forEach(async (item, index) => {
            if (validator(item)) return;

            invalidItemsCount++;

            invalidItems.push({
                datasetId,
                itemPosition: index,
                // The field is typed `unknown`, so no non-null assertion is needed.
                validationErrors: validator.errors,
            });

            // Push in batches so the buffer does not grow with the number of failures.
            if (invalidItems.length >= INVALID_ITEMS_BATCH_SIZE) {
                await defaultDataset.pushData(invalidItems);
                invalidItems = [];
            }
        });

        // Push whatever is left over. A failure here propagates to the catch
        // below, exactly like a failed batch push inside the iteration, so the
        // summary line is never logged for results that were not stored.
        if (invalidItems.length > 0) {
            await defaultDataset.pushData(invalidItems);
        }

        log.info(`Found ${invalidItemsCount} invalid items in the dataset with ID: ${datasetId}, errors are stored in default dataset of this run.`);
    } catch (error) {
        // Catch variables are `unknown` under strict mode; narrow instead of asserting.
        const cause = error instanceof Error ? error : new Error(String(error));
        log.exception(cause, 'Failed to get dataset', { datasetId });
    }
}

// All datasets have been processed (or their failures logged); end the run.
await Actor.exit();