Validate Dataset(s) with JSON Schema avatar
Validate Dataset(s) with JSON Schema

Pricing

Pay per usage

Go to Store
Validate Dataset(s) with JSON Schema

Validate Dataset(s) with JSON Schema

Developed by

Jaroslav Hejlek

Maintained by Community

This Actor validates items in one or more datasets against a provided JSON Schema. Use it if you are planning to add a dataset validation schema to your Actor and want to test it first.

0.0 (0)

Pricing

Pay per usage

0

Monthly users

1

Runs succeeded

83%

Last modified

13 days ago

.dockerignore

1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git
15
16# dist folder
17dist

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "root": true,
3    "env": {
4        "browser": true,
5        "es2020": true,
6        "node": true
7    },
8    "extends": [
9        "@apify/eslint-config-ts"
10    ],
11    "parserOptions": {
12        "project": "./tsconfig.json",
13        "ecmaVersion": 2020
14    },
15    "ignorePatterns": [
16        "node_modules",
17        "dist",
18        "**/*.d.ts"
19    ]
20}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.vscode
5storage
6apify_storage
7crawlee_storage
8node_modules
9dist
10tsconfig.tsbuildinfo
11
12# Added by Apify CLI
13.venv

.nvmrc

1v20.13.1

package.json

1{
2	"name": "validate-dataset-with-json-schema",
3	"version": "0.0.1",
4	"type": "module",
5	"description": "Validates one or more datasets against provided JSON schema.",
6	"engines": {
7		"node": ">=18.0.0"
8	},
9	"dependencies": {
10		"ajv": "^8.6.2",
11		"apify": "^3.2.6"
12	},
13	"devDependencies": {
14		"@apify/eslint-config-ts": "^0.3.0",
15		"@apify/tsconfig": "^0.1.0",
16		"@typescript-eslint/eslint-plugin": "^7.18.0",
17		"@typescript-eslint/parser": "^7.18.0",
18		"eslint": "^8.50.0",
19		"tsx": "^4.6.2",
20		"typescript": "^5.3.3"
21	},
22	"scripts": {
23		"start": "npm run start:dev",
24		"start:prod": "node dist/main.js",
25		"start:dev": "tsx src/main.ts",
26		"build": "tsc",
27		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
28	},
29	"author": "It's not you it's me",
30	"license": "ISC"
31}

tsconfig.json

1{
2    "extends": "@apify/tsconfig",
3    "compilerOptions": {
4        "module": "NodeNext",
5        "moduleResolution": "NodeNext",
6        "target": "ES2022",
7        "outDir": "dist",
8        "noUnusedLocals": false,
9        "skipLibCheck": true,
10        "lib": ["DOM"],
11    },
12    "include": [
13        "./src/**/*"
14    ]
15}

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20 AS builder
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install all dependencies. Don't audit to speed up the installation.
14RUN npm install --include=dev --audit=false
15
16# Next, copy the source files using the user set
17# in the base image.
18COPY . ./
19
20# Build the project: compile the TypeScript sources into the
21# dist folder. Dependencies were already installed above.
22RUN npm run build
23
24# Create final image
25FROM apify/actor-node:20
26
27# Check preinstalled packages
28RUN npm ls crawlee apify puppeteer playwright
29
30# Copy just package.json and package-lock.json
31# to speed up the build using Docker layer cache.
32COPY package*.json ./
33
34# Install NPM packages, skip optional and development dependencies to
35# keep the image small. Avoid logging too much and print the dependency
36# tree for debugging
37RUN npm --quiet set progress=false \
38    && npm install --omit=dev --omit=optional \
39    && echo "Installed NPM packages:" \
40    && (npm list --omit=dev --all || true) \
41    && echo "Node.js version:" \
42    && node --version \
43    && echo "NPM version:" \
44    && npm --version \
45    && rm -r ~/.npm
46
47# Copy built JS files from builder image
48COPY --from=builder /usr/src/app/dist ./dist
49
50# Next, copy the remaining files and directories with the source code.
51# Since we do this after NPM install, quick build will be really fast
52# for most source file changes.
53COPY . ./
54
55# Create and run as a non-root user.
56RUN adduser -h /home/apify -D apify && \
57    chown -R apify:apify ./
58USER apify
59
60# Run the image.
61CMD npm run start:prod --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "validate-dataset-with-json-schema",
4    "title": "Validate dataset(s) with schema",
5    "description": "Validates one or more datasets against provided JSON schema.",
6    "version": "0.0",
7    "buildTag": "latest",
8    "input": "input_schema.json",
9    "storages": {
10        "dataset": {
11            "actorSpecification": 1,
12            "fields": {
13                "$schema": "http://json-schema.org/draft-07/schema#",
14                "type": "object",
15                "properties": {
16                    "datasetId": {
17                        "type": "string"
18                    },
19                    "itemPosition": {
20                        "type": "number"
21                    },
22                    "validationErrors": {
23                        "type": "array",
24                        "items": {
25                            "type": "object",
26                            "additionalProperties": true
27                        }
28                    }
29                },
30                "required": ["datasetId", "itemPosition", "validationErrors"]
31            },
32            "views": {
33                "overview": {
34                    "title": "Overview",
35                    "display": {
36                        "component": "table",
37                        "properties": {
38                            "datasetId": {
39                                "label": "Dataset ID",
40                                "format": "text"
41                            },
42                            "itemPosition": {
43                                "label": "Position",
44                                "format": "number"
45                            },
46                            "validationErrors": {
47                                "label": "Errors",
48                                "format": "object"
49                            }
50                        }
51                    }
52                }
53            }
54        }
55    },
56    "meta": {
57        "templateId": "ts-empty"
58    },
59    "dockerfile": "./Dockerfile"
60}

.actor/input_schema.json

1{
2    "title": "Validate dataset(s) with schema",
3    "description": "Validates one or more datasets against provided JSON schema.",
4    "type": "object",
5    "schemaVersion": 1,
6    "properties": {
7        "datasetIds": {
8            "type": "array",
9            "resourceType": "dataset",
10            "title": "Dataset ID(s)",
11            "description": "Provide ID(s) of the datasets to validate against the schema."
12        },
13        "schema": {
14            "type": "object",
15            "editor": "json",
16            "title": "JSON Schema",
17            "description": "The JSON schema to validate the dataset items against.",
18            "prefill": {
19                "$schema": "http://json-schema.org/draft-07/schema#",
20                "type": "object",
21                "properties": {
22
23                },
24                "required": []
25            }
26        }
27    },
28    "required": ["datasetIds", "schema"]
29}

src/dataset_utils.ts

1import { type ApifyClient } from 'apify';
2
/**
 * Resolves a user-supplied identifier to the ID of an existing dataset.
 *
 * The identifier is first looked up as a dataset. If no dataset is found, it is
 * looked up as a run, and the ID of that run's default dataset is returned instead.
 *
 * @param datasetId Identifier supplied by the user. Surrounding whitespace is ignored;
 *   a non-string or blank value is treated as missing.
 * @param client API client used to perform both lookups.
 * @returns ID of the dataset that should be validated.
 * @throws Error when the identifier is missing, or when it matches neither a dataset nor a run.
 */
export async function maybeTranslateDatasetId(datasetId: string, client: ApifyClient): Promise<string> {
    // IDs are frequently pasted with stray whitespace; normalize before hitting the API.
    const lookupId = typeof datasetId === 'string' ? datasetId.trim() : '';
    if (!lookupId) throw new Error('Dataset ID is required.');

    // If the dataset was found, the ID is valid and nothing else needs to be done.
    const dataset = await client.dataset(lookupId).get();
    if (dataset) return dataset.id;

    // The dataset was not found. Check whether the user provided a run ID instead.
    const run = await client.run(lookupId).get();
    if (run) return run.defaultDatasetId;

    throw new Error(`Dataset not found: "${lookupId}" matches neither a dataset nor a run.`);
}

src/main.ts

1import { Actor, log } from 'apify';
2
3import { getValidator } from './validator.js';
4import { maybeTranslateDatasetId } from './dataset_utils.js';
5
// Entry point: validates every item of the requested datasets against the provided
// JSON schema and stores the validation errors in the default dataset of this run.

/** Number of invalid items buffered in memory before they are pushed to the default dataset. */
const PUSH_BATCH_SIZE = 1000;
/** Item count above which a warning about long validation time is logged. */
const LARGE_DATASET_ITEM_COUNT = 100000;

await Actor.init();

const input = await Actor.getInput<{
    datasetIds: string[];
    schema: Record<string, unknown>;
}>();

// Reject missing input early. `datasetIds` has to be a real array: a plain string
// would otherwise be iterated character by character in the loop below.
if (!input || !Array.isArray(input.datasetIds) || !input.schema) {
    throw new Error('Both datasetIds and schema are required.');
}

// The schema is compiled once and the same validator is reused for all datasets.
const validator = await getValidator(input.schema);

// `Actor.apifyClient` is a synchronous getter, so there is nothing to await.
const client = Actor.apifyClient;

const defaultDataset = await Actor.openDataset();

for (const datasetId of input.datasetIds) {
    // Buffer of invalid items waiting to be pushed; reset after every flush.
    let invalidItems: Array<{
        datasetId: string;
        itemPosition: number;
        validationErrors: unknown;
    }> = [];

    try {
        log.info(`Checking existence of dataset with ID/Name: ${datasetId}`);

        // The user may have provided a run ID instead of a dataset ID.
        const realDatasetId = await maybeTranslateDatasetId(datasetId, client);

        log.info(`Validating dataset with ID: ${realDatasetId}`);

        const dataset = await Actor.openDataset(realDatasetId, { forceCloud: true });

        const info = await dataset.getInfo();
        if (!info) throw new Error('Dataset not found!');

        log.info(`Dataset contains ${info.itemCount} items`);
        if (info.itemCount > LARGE_DATASET_ITEM_COUNT) {
            log.warning('Dataset is very large, validation may take a long time.');
        }

        let invalidItemsCount = 0;

        await dataset.forEach(async (item, index) => {
            if (validator(item)) return;

            invalidItemsCount++;

            invalidItems.push({
                datasetId,
                itemPosition: index,
                // The validator exposes the errors of the most recent call on `errors`.
                validationErrors: validator.errors ?? [],
            });

            // Flush in batches so the buffer does not grow without bound.
            if (invalidItems.length === PUSH_BATCH_SIZE) {
                await defaultDataset.pushData(invalidItems);
                invalidItems = [];
            }
        });

        // Push any remaining items. A failure here is only logged (best effort),
        // unlike a failure of a batch push above, which fails the whole run.
        if (invalidItems.length > 0) {
            try {
                await defaultDataset.pushData(invalidItems);
            } catch (error) {
                // The caught value is `unknown`; read the optional diagnostic fields defensively.
                const message = error instanceof Error ? error.message : String(error);
                const details = (error as { data?: { invalidItems?: Array<{ validationErrors?: unknown }> } } | null | undefined)?.data;
                log.error('push failed', { error: message, data: details?.invalidItems?.[0]?.validationErrors });
            }
        }

        log.info(`Found ${invalidItemsCount} invalid items in the dataset with ID: ${datasetId}, errors are stored in default dataset of this run.`);
    } catch (error) {
        log.exception(error as Error, 'Failed to get dataset', { datasetId });
        await Actor.fail(`Failed to get dataset with ID: ${datasetId}`);
    }
}

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();

src/validator.ts

1import { Actor, log } from 'apify';
2import Ajv from 'ajv';
3
/**
 * Compiles the provided JSON schema into a validation function.
 *
 * The validator is created with `strict: false`, `unicodeRegExp: false` and
 * `allErrors: true`, so every violation of an item is collected rather than
 * only the first one.
 *
 * @param schema JSON schema the dataset items should be validated against.
 * @returns The compiled validation function.
 * @throws Rethrows the compilation error after logging it and failing the Actor run,
 *   so callers never receive an undefined validator.
 */
export async function getValidator(schema: Record<string, unknown>) {
    // Check if the schema is valid JSON schema and initialize the validator
    // NOTE(review): the suppressed error is presumably caused by default-import interop
    // of the `ajv` package under NodeNext module resolution — confirm before removing.
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore Not sure what is the reason for the error...
    const ajv = new Ajv({ strict: false, unicodeRegExp: false, allErrors: true });

    let validator;
    try {
        validator = ajv.compile(schema);
    } catch (error) {
        log.exception(error as Error, 'Failed to create validator');
        await Actor.fail('Invalid schema provided. Please provide a valid JSON schema.');
        // Only reached if failing the Actor did not terminate the process: make sure
        // the caller does not continue with a missing validator.
        throw error;
    }

    // Logged only after a successful compilation.
    log.info('Initialized validator using provided schema');

    return validator;
}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.