Validate Dataset(s) with JSON Schema avatar
Validate Dataset(s) with JSON Schema

Pricing

Pay per usage

Go to Store
Validate Dataset(s) with JSON Schema

Validate Dataset(s) with JSON Schema

Developed by

Jaroslav Hejlek

Jaroslav Hejlek

Maintained by Community

This Actor validates items in one or more datasets against a provided JSON Schema. Use it if you are planning to add a dataset validation schema to your Actor and you want to test it.

0.0 (0)

Pricing

Pay per usage

0

Total users

1

Monthly users

1

Runs succeeded

50%

Last modified

a month ago

.dockerignore

# configurations
.idea
.vscode
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git
# dist folder
dist

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"root": true,
"env": {
"browser": true,
"es2020": true,
"node": true
},
"extends": [
"@apify/eslint-config-ts"
],
"parserOptions": {
"project": "./tsconfig.json",
"ecmaVersion": 2020
},
"ignorePatterns": [
"node_modules",
"dist",
"**/*.d.ts"
]
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.vscode
storage
apify_storage
crawlee_storage
node_modules
dist
tsconfig.tsbuildinfo
# Added by Apify CLI
.venv

.nvmrc

v20.13.1

package.json

{
"name": "validate-dataset-with-json-schema",
"version": "0.0.1",
"type": "module",
"description": "Validates one or more datasets against provided JSON schema.",
"engines": {
"node": ">=18.0.0"
},
"dependencies": {
"ajv": "^8.6.2",
"apify": "^3.2.6"
},
"devDependencies": {
"@apify/eslint-config-ts": "^0.3.0",
"@apify/tsconfig": "^0.1.0",
"@typescript-eslint/eslint-plugin": "^7.18.0",
"@typescript-eslint/parser": "^7.18.0",
"eslint": "^8.50.0",
"tsx": "^4.6.2",
"typescript": "^5.3.3"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}

tsconfig.json

{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"skipLibCheck": true,
"lib": ["DOM"],
},
"include": [
"./src/**/*"
]
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false
# Next, copy the source files using the user set
# in the base image.
COPY . ./
# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build
# Create final image
FROM apify/actor-node:20
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Copy built JS files from builder image
COPY --from=builder /usr/src/app/dist ./dist
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
chown -R apify:apify ./
USER apify
# Run the image.
CMD npm run start:prod --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "validate-dataset-with-json-schema",
"title": "Validate dataset(s) with schema",
"description": "Validates one or more datasets against provided JSON schema.",
"version": "0.0",
"buildTag": "latest",
"input": "input_schema.json",
"storages": {
"dataset": {
"actorSpecification": 1,
"fields": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"datasetId": {
"type": "string"
},
"itemPosition": {
"type": "number"
},
"validationErrors": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": true
}
}
},
"required": ["datasetId", "itemPosition", "validationErrors"]
},
"views": {
"overview": {
"title": "Overview",
"display": {
"component": "table",
"properties": {
"datasetId": {
"label": "Dataset ID",
"format": "text"
},
"itemPosition": {
"label": "Position",
"format": "number"
},
"validationErrors": {
"label": "Errors",
"format": "object"
}
}
}
}
}
}
},
"meta": {
"templateId": "ts-empty"
},
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Validate dataset(s) with schema",
"description": "Validates one or more datasets against provided JSON schema.",
"type": "object",
"schemaVersion": 1,
"properties": {
"datasetIds": {
"type": "array",
"resourceType": "dataset",
"title": "Dataset ID(s)",
"description": "Provide ID(s} of datasets to validate again schema."
},
"schema": {
"type": "object",
"editor": "json",
"title": "JSON Schema",
"description": "The JSON schema to validate the dataset items against.",
"prefill": {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
},
"required": []
}
}
},
"required": ["datasetIds", "schema"]
}

src/dataset_utils.ts

1import { type ApifyClient } from 'apify';
2
3export async function maybeTranslateDatasetId(datasetId: string, client: ApifyClient) {
4 if (!datasetId) throw new Error('Dataset ID is required.');
5 const dataset = await client.dataset(datasetId).get();
6
7 // If dataset was found the ID is valid and we do not need to do anything else
8 if (dataset) return dataset.id;
9
10 // The dataset was not found. Let's check if maybe the user provided Run ID
11 const run = await client.run(datasetId).get();
12 if (run) return run.defaultDatasetId;
13
14 throw new Error('Dataset not found');
15}

src/main.ts

1import { Actor, log } from 'apify';
2
3import { getValidator } from './validator.js';
4import { maybeTranslateDatasetId } from './dataset_utils.js';
5
6await Actor.init();
7
8const input = await Actor.getInput<{
9 datasetIds: string[];
10 schema: Record<string, unknown>;
11}>();
12
13if (input === null || !input.datasetIds || !input.schema) {
14 throw new Error('Both datasetIds and schema are required.');
15}
16
17const validator = await getValidator(input.schema);
18
19const client = await Actor.apifyClient;
20
21const defaultDataset = await Actor.openDataset();
22
23for (const datasetId of input.datasetIds) {
24 let invalidItems: Array<{
25 datasetId: string;
26 itemPosition: number;
27 validationErrors: unknown;
28 }> = [];
29
30 try {
31 log.info(`Checking existence of dataset with ID/Name: ${datasetId}`);
32
33 const realDatasetId = await maybeTranslateDatasetId(datasetId, client);
34
35 log.info(`Validating dataset with ID: ${realDatasetId}`);
36
37 const dataset = await Actor.openDataset(realDatasetId, { forceCloud: true });
38
39 const info = await dataset.getInfo();
40 if (!info) throw new Error('Dataset not found!');
41
42 log.info(`Dataset contains ${info.itemCount} items`);
43 if (info.itemCount > 100000) {
44 log.warning('Dataset is very large, validation may take a long time.');
45 }
46
47 let invalidItemsCount = 0;
48
49 await dataset.forEach(async (item, index) => {
50 const isValid = validator(item);
51 if (isValid) return;
52
53 invalidItemsCount++;
54
55 invalidItems.push({
56 datasetId,
57 itemPosition: index,
58 validationErrors: validator.errors!,
59 });
60
61 if (invalidItems.length === 1000) {
62 await defaultDataset.pushData(invalidItems);
63 invalidItems = [];
64 }
65 });
66
67 // Push any remaining items
68 if (invalidItems.length > 0) {
69 try {
70 await defaultDataset.pushData(invalidItems);
71 } catch (error) {
72 // @ts-expect-error We are using properties we are not sure exist
73 log.error('push failed', { error: error.message, data: error.data?.invalidItems?.[0]?.validationErrors });
74 }
75 }
76
77 log.info(`Found ${invalidItemsCount} invalid items in the dataset with ID: ${datasetId}, errors are stored in default dataset of this run.`);
78 } catch (error) {
79 log.exception(error as Error, 'Failed to get dataset', { datasetId });
80 await Actor.fail(`Failed to get dataset with ID: ${datasetId}`);
81 }
82}
83
84// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
85await Actor.exit();

src/validator.ts

1import { Actor, log } from 'apify';
2import Ajv from 'ajv';
3
4export async function getValidator(schema: Record<string, unknown>) {
5 // Check if the schema is valid JSON schema and initialize the validator
6 // eslint-disable-next-line @typescript-eslint/ban-ts-comment
7 // @ts-ignore Not sure what is the reason for the error...
8 const ajv = new Ajv({ strict: false, unicodeRegExp: false, allErrors: true });
9 let validator;
10 try {
11 validator = ajv.compile(schema);
12 } catch (error) {
13 log.exception(error as Error, 'Failed to create validator');
14 await Actor.fail('Invalid schema provided. Please provide a valid JSON schema.');
15 }
16
17 log.info('Initialized validator using provided schema');
18
19 return validator;
20}