Dataset(s) To Schema

Developed by Zuzka Pelechová
Maintained by Community

Pricing: Pay per usage

Takes one or more dataset IDs and outputs a JSON schema describing the contents of the dataset(s) into the key-value store.
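
As a rough usage sketch, the generated schema can be fetched from the run's default key-value store with the Apify JavaScript client (assuming the apify-client package is installed; the actor identifier and dataset ID below are placeholders):

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Call the actor with the dataset(s) to analyze (placeholder actor and dataset IDs).
const run = await client.actor('<username>/dataset-to-schema').call({
    datasetIds: ['<YOUR_DATASET_ID>'],
});

// The actor stores the combined schema under the SCHEMA key of the run's default key-value store.
const record = await client.keyValueStore(run.defaultKeyValueStoreId).getRecord('SCHEMA');
console.log(JSON.stringify(record?.value, null, 2));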


.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "dataset-to-schema",
    "title": "Dataset(s) To Schema",
    "description": "Generates a JSON schema from the contents of one or more datasets and stores it in the key-value store.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "js-empty"
    },
    "dockerfile": "./Dockerfile",
    "input": "./input_schema.json"
}

.actor/input_schema.json

{
    "title": "Dataset(s) To Schema input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "datasetIds": {
            "title": "Dataset IDs",
            "type": "array",
            "description": "IDs of the datasets for which to generate the schema",
            "editor": "stringList"
        }
    },
    "required": ["datasetIds"]
}
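
For illustration, an input accepted by this schema might look like the following (the dataset IDs are placeholders):

{
    "datasetIds": ["aBcDeFgHiJkLmNoPq", "rStUvWxYz01234567"]
}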

src/main.js

import { Actor } from 'apify';

await Actor.init();

const { datasetIds } = await Actor.getInput();

// Fallback type used for fields whose only observed value so far is null.
const anyType = ["number", "string", "boolean", "object", "array"];

// Record a newly observed type for a field, merging it with any type(s) seen before.
function setType(field, currentType, newType) {
    if (!currentType || currentType === anyType) {
        field.type = newType;
    } else if (Array.isArray(currentType)) {
        // Already a union of types; add the new one only if it is not present yet.
        if (!currentType.includes(newType)) field.type.push(newType);
    } else if (currentType !== newType) {
        field.type = [currentType, newType];
    } else {
        field.type = newType;
    }
}

// Recursively walk an item's keys and merge what is found into the given schema layer.
function parseSchema(keys, item, fieldsLayer) {
    keys.forEach((key) => {
        const field = item[key];
        const type = typeof field;

        if (!fieldsLayer[key]) fieldsLayer[key] = {};
        const currentType = fieldsLayer[key].type;

        // Primitive values (string, number, boolean, ...).
        if (type !== 'object') {
            setType(fieldsLayer[key], currentType, type);
            return;
        }

        // typeof null === 'object', so handle null explicitly.
        if (field === null) {
            if (!fieldsLayer[key].type) fieldsLayer[key].type = anyType;
            fieldsLayer[key].nullable = true;
            return;
        }

        if (Array.isArray(field)) {
            setType(fieldsLayer[key], currentType, 'array');

            const array = field;

            if (!fieldsLayer[key].items) {
                fieldsLayer[key].items = { type: 'object', properties: {} };
            }

            // Merge the schema of every object element of the array.
            array.forEach((element) => {
                if (typeof element === 'object' && element !== null && !Array.isArray(element)) {
                    parseSchema(Object.keys(element), element, fieldsLayer[key].items.properties);
                }
            });

            return;
        }

        // Field is a plain object.
        setType(fieldsLayer[key], currentType, 'object');
        fieldsLayer[key].properties = fieldsLayer[key].properties || {};
        parseSchema(Object.keys(field), field, fieldsLayer[key].properties);
    });
}

// Shared schema across all datasets.
const schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {},
    "required": []
};

for (const datasetId of datasetIds) {
    console.log(`Processing dataset: ${datasetId}`);
    const dataset = await Actor.openDataset(datasetId);

    try {
        await dataset.forEach(async (item, index) => {
            parseSchema(Object.keys(item), item, schema.properties);
            // Abort iteration by throwing once the item limit is reached.
            if (index > 1000000) throw new Error('reached-limit');
        });
    } catch (err) {
        if (err.message === 'reached-limit') {
            console.log(`Dataset ${datasetId} contains more than 1,000,000 items. Schema might not be perfect.`);
        } else {
            throw err;
        }
    }
}

console.log('Storing combined schema under SCHEMA key in key-value store');
await Actor.setValue('SCHEMA', schema);

await Actor.exit();
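
As a rough illustration (using a made-up item, not output from any real dataset), if every item of a dataset looked like

{
    "title": "Foo",
    "price": 12.5,
    "images": [{ "url": "https://example.com/1.png" }],
    "seller": { "name": "ACME", "rating": null }
}

the value stored under the SCHEMA key would resemble:

{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "title": { "type": "string" },
        "price": { "type": "number" },
        "images": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "url": { "type": "string" }
                }
            }
        },
        "seller": {
            "type": "object",
            "properties": {
                "name": { "type": "string" },
                "rating": {
                    "type": ["number", "string", "boolean", "object", "array"],
                    "nullable": true
                }
            }
        }
    },
    "required": []
}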

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
node_modules
storage

package.json

{
    "name": "js-empty-project",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify Actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4"
    },
    "scripts": {
        "start": "node ./src/main.js",
        "lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
        "lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}