Dataset(s) To Schema

Developed by Zuzka Pelechová
Maintained by Community

Pricing: Pay per usage

Takes one or more dataset IDs and outputs a JSON schema describing the contents of the dataset(s) into the key-value store.
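
As a rough usage sketch, the generated schema can be fetched from the run's default key-value store with the Apify JavaScript client (assuming the apify-client package is installed; the actor identifier and dataset ID below are placeholders):

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Call the actor with the dataset(s) to analyze (placeholder actor and dataset IDs).
const run = await client.actor('<username>/dataset-to-schema').call({
    datasetIds: ['<YOUR_DATASET_ID>'],
});

// The actor stores the combined schema under the SCHEMA key of the run's default key-value store.
const record = await client.keyValueStore(run.defaultKeyValueStoreId).getRecord('SCHEMA');
console.log(JSON.stringify(record?.value, null, 2));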


.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "dataset-to-schema",
    "title": "Dataset(s) To Schema",
    "description": "Generates a JSON schema from the contents of one or more datasets and stores it in the key-value store.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "js-empty"
    },
    "dockerfile": "./Dockerfile",
    "input": "./input_schema.json"
}

.actor/input_schema.json

{
    "title": "Dataset(s) To Schema input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "datasetIds": {
            "title": "Dataset IDs",
            "type": "array",
            "description": "IDs of the datasets for which to generate the schema",
            "editor": "stringList"
        }
    },
    "required": ["datasetIds"]
}
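
For illustration, an input accepted by this schema might look like the following (the dataset IDs are placeholders):

{
    "datasetIds": ["aBcDeFgHiJkLmNoPq", "rStUvWxYz01234567"]
}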

src/main.js

import { Actor } from 'apify';

await Actor.init();

const { datasetIds } = await Actor.getInput();

// Fallback type used for fields whose only observed value so far is null.
const anyType = ["number", "string", "boolean", "object", "array"];

// Record a newly observed type for a field, merging it with any type(s) seen before.
function setType(field, currentType, newType) {
    if (!currentType || currentType === anyType) {
        field.type = newType;
    } else if (Array.isArray(currentType)) {
        // Already a union of types; add the new one only if it is not present yet.
        if (!currentType.includes(newType)) field.type.push(newType);
    } else if (currentType !== newType) {
        field.type = [currentType, newType];
    } else {
        field.type = newType;
    }
}

// Recursively walk an item's keys and merge what is found into the given schema layer.
function parseSchema(keys, item, fieldsLayer) {
    keys.forEach((key) => {
        const field = item[key];
        const type = typeof field;

        if (!fieldsLayer[key]) fieldsLayer[key] = {};
        const currentType = fieldsLayer[key].type;

        // Primitive values (string, number, boolean, ...).
        if (type !== 'object') {
            setType(fieldsLayer[key], currentType, type);
            return;
        }

        // typeof null === 'object', so handle null explicitly.
        if (field === null) {
            if (!fieldsLayer[key].type) fieldsLayer[key].type = anyType;
            fieldsLayer[key].nullable = true;
            return;
        }

        if (Array.isArray(field)) {
            setType(fieldsLayer[key], currentType, 'array');

            const array = field;

            if (!fieldsLayer[key].items) {
                fieldsLayer[key].items = { type: 'object', properties: {} };
            }

            // Merge the schema of every object element of the array.
            array.forEach((element) => {
                if (typeof element === 'object' && element !== null && !Array.isArray(element)) {
                    parseSchema(Object.keys(element), element, fieldsLayer[key].items.properties);
                }
            });

            return;
        }

        // Field is a plain object.
        setType(fieldsLayer[key], currentType, 'object');
        fieldsLayer[key].properties = fieldsLayer[key].properties || {};
        parseSchema(Object.keys(field), field, fieldsLayer[key].properties);
    });
}

// Shared schema across all datasets.
const schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {},
    "required": []
};

for (const datasetId of datasetIds) {
    console.log(`Processing dataset: ${datasetId}`);
    const dataset = await Actor.openDataset(datasetId);

    try {
        await dataset.forEach(async (item, index) => {
            parseSchema(Object.keys(item), item, schema.properties);
            // Abort iteration by throwing once the item limit is reached.
            if (index > 1000000) throw new Error('reached-limit');
        });
    } catch (err) {
        if (err.message === 'reached-limit') {
            console.log(`Dataset ${datasetId} contains more than 1,000,000 items. Schema might not be perfect.`);
        } else {
            throw err;
        }
    }
}

console.log('Storing combined schema under SCHEMA key in key-value store');
await Actor.setValue('SCHEMA', schema);

await Actor.exit();
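
As a rough illustration (using a made-up item, not output from any real dataset), if every item of a dataset looked like

{
    "title": "Foo",
    "price": 12.5,
    "images": [{ "url": "https://example.com/1.png" }],
    "seller": { "name": "ACME", "rating": null }
}

the value stored under the SCHEMA key would resemble:

{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "title": { "type": "string" },
        "price": { "type": "number" },
        "images": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "url": { "type": "string" }
                }
            }
        },
        "seller": {
            "type": "object",
            "properties": {
                "name": { "type": "string" },
                "rating": {
                    "type": ["number", "string", "boolean", "object", "array"],
                    "nullable": true
                }
            }
        }
    },
    "required": []
}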

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
node_modules
storage

package.json

{
    "name": "js-empty-project",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify Actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4"
    },
    "scripts": {
        "start": "node ./src/main.js",
        "lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
        "lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}