
Validate Dataset(s) with JSON Schema
This Actor validates items in one or more datasets against a provided JSON Schema. Use it if you are planning to add a dataset validation schema to your Actor and want to test it.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Runs succeeded: 50%
Last modified: a month ago
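
To run the Actor you provide one or more dataset (or run) IDs plus the JSON Schema the items must conform to. Below is a minimal sketch using the apify-client package (not part of this Actor's dependencies, install it separately); the Actor ID placeholder <username>/validate-dataset-with-json-schema, the dataset ID 'my-dataset-id', and the example schema are all assumptions for illustration:

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Placeholder Actor ID; replace <username> with the account that published the Actor.
const run = await client.actor('<username>/validate-dataset-with-json-schema').call({
    // Dataset IDs, dataset names, or run IDs (run IDs are resolved to their default dataset).
    datasetIds: ['my-dataset-id'],
    // JSON Schema the dataset items must conform to.
    schema: {
        $schema: 'http://json-schema.org/draft-07/schema#',
        type: 'object',
        properties: {
            url: { type: 'string' },
            title: { type: 'string' },
        },
        required: ['url', 'title'],
    },
});

console.log(`Validation run finished with status: ${run.status}`);

Invalid items found during the run are written to the run's default dataset, as described in .actor/actor.json below.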
.dockerignore
# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

# dist folder
dist
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "root": true, "env": { "browser": true, "es2020": true, "node": true }, "extends": [ "@apify/eslint-config-ts" ], "parserOptions": { "project": "./tsconfig.json", "ecmaVersion": 2020 }, "ignorePatterns": [ "node_modules", "dist", "**/*.d.ts" ]}
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.vscode
storage
apify_storage
crawlee_storage
node_modules
dist
tsconfig.tsbuildinfo

# Added by Apify CLI
.venv
.nvmrc
v20.13.1
package.json
{ "name": "validate-dataset-with-json-schema", "version": "0.0.1", "type": "module", "description": "Validates one or more datasets against provided JSON schema.", "engines": { "node": ">=18.0.0" }, "dependencies": { "ajv": "^8.6.2", "apify": "^3.2.6" }, "devDependencies": { "@apify/eslint-config-ts": "^0.3.0", "@apify/tsconfig": "^0.1.0", "@typescript-eslint/eslint-plugin": "^7.18.0", "@typescript-eslint/parser": "^7.18.0", "eslint": "^8.50.0", "tsx": "^4.6.2", "typescript": "^5.3.3" }, "scripts": { "start": "npm run start:dev", "start:prod": "node dist/main.js", "start:dev": "tsx src/main.ts", "build": "tsc", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}
tsconfig.json
{ "extends": "@apify/tsconfig", "compilerOptions": { "module": "NodeNext", "moduleResolution": "NodeNext", "target": "ES2022", "outDir": "dist", "noUnusedLocals": false, "skipLibCheck": true, "lib": ["DOM"], }, "include": [ "./src/**/*" ]}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
    chown -R apify:apify ./
USER apify

# Run the image.
CMD npm run start:prod --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "validate-dataset-with-json-schema", "title": "Validate dataset(s) with schema", "description": "Validates one or more datasets against provided JSON schema.", "version": "0.0", "buildTag": "latest", "input": "input_schema.json", "storages": { "dataset": { "actorSpecification": 1, "fields": { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": { "datasetId": { "type": "string" }, "itemPosition": { "type": "number" }, "validationErrors": { "type": "array", "items": { "type": "object", "additionalProperties": true } } }, "required": ["datasetId", "itemPosition", "validationErrors"] }, "views": { "overview": { "title": "Overview", "display": { "component": "table", "properties": { "datasetId": { "label": "Dataset ID", "format": "text" }, "itemPosition": { "label": "Position", "format": "number" }, "validationErrors": { "label": "Errors", "format": "object" } } } } } } }, "meta": { "templateId": "ts-empty" }, "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Validate dataset(s) with schema", "description": "Validates one or more datasets against provided JSON schema.", "type": "object", "schemaVersion": 1, "properties": { "datasetIds": { "type": "array", "resourceType": "dataset", "title": "Dataset ID(s)", "description": "Provide ID(s} of datasets to validate again schema." }, "schema": { "type": "object", "editor": "json", "title": "JSON Schema", "description": "The JSON schema to validate the dataset items against.", "prefill": { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": {
}, "required": [] } } }, "required": ["datasetIds", "schema"]}
src/dataset_utils.ts
import { type ApifyClient } from 'apify';

export async function maybeTranslateDatasetId(datasetId: string, client: ApifyClient) {
    if (!datasetId) throw new Error('Dataset ID is required.');
    const dataset = await client.dataset(datasetId).get();

    // If dataset was found the ID is valid and we do not need to do anything else
    if (dataset) return dataset.id;

    // The dataset was not found. Let's check if maybe the user provided Run ID
    const run = await client.run(datasetId).get();
    if (run) return run.defaultDatasetId;

    throw new Error('Dataset not found');
}
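
The helper above accepts either a dataset ID/name or a run ID; when the dataset lookup fails, it falls back to the run's default dataset. A small usage sketch, where 'abc123' is a made-up run ID used purely for illustration:

import { Actor } from 'apify';
import { maybeTranslateDatasetId } from './dataset_utils.js';

await Actor.init();

// Actor.apifyClient returns an ApifyClient authenticated for the current run.
const client = Actor.apifyClient;

// 'abc123' is a hypothetical run ID; a real dataset ID would be returned unchanged.
const resolvedId = await maybeTranslateDatasetId('abc123', client);
console.log(`Resolved dataset ID: ${resolvedId}`);

await Actor.exit();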
src/main.ts
import { Actor, log } from 'apify';

import { getValidator } from './validator.js';
import { maybeTranslateDatasetId } from './dataset_utils.js';

await Actor.init();

const input = await Actor.getInput<{
    datasetIds: string[];
    schema: Record<string, unknown>;
}>();

if (input === null || !input.datasetIds || !input.schema) {
    throw new Error('Both datasetIds and schema are required.');
}

const validator = await getValidator(input.schema);

const client = Actor.apifyClient;

const defaultDataset = await Actor.openDataset();

for (const datasetId of input.datasetIds) {
    let invalidItems: Array<{
        datasetId: string;
        itemPosition: number;
        validationErrors: unknown;
    }> = [];

    try {
        log.info(`Checking existence of dataset with ID/Name: ${datasetId}`);

        const realDatasetId = await maybeTranslateDatasetId(datasetId, client);

        log.info(`Validating dataset with ID: ${realDatasetId}`);

        const dataset = await Actor.openDataset(realDatasetId, { forceCloud: true });

        const info = await dataset.getInfo();
        if (!info) throw new Error('Dataset not found!');

        log.info(`Dataset contains ${info.itemCount} items`);
        if (info.itemCount > 100000) {
            log.warning('Dataset is very large, validation may take a long time.');
        }

        let invalidItemsCount = 0;

        await dataset.forEach(async (item, index) => {
            const isValid = validator(item);
            if (isValid) return;

            invalidItemsCount++;

            invalidItems.push({
                datasetId,
                itemPosition: index,
                validationErrors: validator.errors!,
            });

            if (invalidItems.length === 1000) {
                await defaultDataset.pushData(invalidItems);
                invalidItems = [];
            }
        });

        // Push any remaining items
        if (invalidItems.length > 0) {
            try {
                await defaultDataset.pushData(invalidItems);
            } catch (error) {
                // @ts-expect-error We are using properties we are not sure exist
                log.error('push failed', { error: error.message, data: error.data?.invalidItems?.[0]?.validationErrors });
            }
        }

        log.info(`Found ${invalidItemsCount} invalid items in the dataset with ID: ${datasetId}, errors are stored in default dataset of this run.`);
    } catch (error) {
        log.exception(error as Error, 'Failed to get dataset', { datasetId });
        await Actor.fail(`Failed to get dataset with ID: ${datasetId}`);
    }
}

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();
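
The invalid items pushed above end up in the run's default dataset with the { datasetId, itemPosition, validationErrors } shape declared in .actor/actor.json. A minimal sketch of reading them back after a run, again using the apify-client package; 'RUN_ID' is a placeholder for a finished run of this Actor:

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// 'RUN_ID' is a placeholder; use the ID of a finished run of this Actor.
const run = await client.run('RUN_ID').get();
if (!run) throw new Error('Run not found');

// Each item follows the output schema from .actor/actor.json:
// { datasetId, itemPosition, validationErrors }
const { items } = await client.dataset(run.defaultDatasetId).listItems();
for (const item of items) {
    console.log(item.datasetId, item.itemPosition, item.validationErrors);
}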
src/validator.ts
import { Actor, log } from 'apify';
import Ajv from 'ajv';

export async function getValidator(schema: Record<string, unknown>) {
    // Check if the schema is valid JSON schema and initialize the validator
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore Not sure what is the reason for the error...
    const ajv = new Ajv({ strict: false, unicodeRegExp: false, allErrors: true });
    let validator;
    try {
        validator = ajv.compile(schema);
    } catch (error) {
        log.exception(error as Error, 'Failed to create validator');
        await Actor.fail('Invalid schema provided. Please provide a valid JSON schema.');
    }

    log.info('Initialized validator using provided schema');

    return validator;
}
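
The validationErrors stored for each failing item are Ajv's validator.errors array. A standalone sketch with a trivial, made-up schema showing roughly what those error objects contain:

import Ajv from 'ajv';

// Same Ajv options as getValidator() above.
const ajv = new Ajv({ strict: false, unicodeRegExp: false, allErrors: true });

// Hypothetical schema used only to demonstrate the error shape.
const validate = ajv.compile({
    type: 'object',
    properties: { url: { type: 'string' } },
    required: ['url'],
});

if (!validate({ title: 'missing url' })) {
    // Ajv 8 error objects look roughly like:
    // { instancePath: '', schemaPath: '#/required', keyword: 'required',
    //   params: { missingProperty: 'url' }, message: "must have required property 'url'" }
    console.log(validate.errors);
}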