Dataset(s) To Schema
Takes one or more dataset IDs and outputs a JSON schema of the datasets' contents into the key-value store.
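For illustration, given a hypothetical dataset whose items look like { "url": "https://example.com", "price": 12, "seller": { "name": "ACME" } }, the schema stored under the SCHEMA key would come out roughly as:

{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "url": { "type": "string" },
        "price": { "type": "number" },
        "seller": {
            "type": "object",
            "properties": {
                "name": { "type": "string" }
            }
        }
    },
    "required": []
}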
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "dataset-to-schema", "title": "Empty JavaScript project", "description": "Empty project in JavaScript.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "js-empty" }, "dockerfile": "./Dockerfile", "input": "./input_schema.json"}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "datasetIds": { "title": "Dataset IDs", "type": "array", "description": "IDs of the dataset for which we are trying to generate schema", "editor": "stringList" } }, "required": ["datasetIds"]}
src/main.js
import { Actor } from 'apify';

await Actor.init();

const { datasetIds } = await Actor.getInput();

// Placeholder for fields whose type is not yet known (e.g. only null values seen so far).
const anyType = ['number', 'string', 'boolean', 'object', 'array'];
// Record `newType` for a field, merging it with any type(s) observed on earlier items.
function setType(field, currentType, newType) {
    if (!currentType || currentType === anyType) {
        // Nothing recorded yet (or only the permissive placeholder): take the new type.
        field.type = newType;
    } else if (Array.isArray(currentType)) {
        // Already a union of types: extend it only if this type is new.
        if (!currentType.includes(newType)) currentType.push(newType);
    } else if (currentType !== newType) {
        // Two different scalar types observed: turn them into a union.
        field.type = [currentType, newType];
    }
    // Otherwise the type is unchanged; nothing to do.
}
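// Illustrative merge behaviour (not executed by the Actor; `f` is a hypothetical field record):
//   setType(f, undefined, 'string')              -> f.type === 'string'
//   setType(f, 'string', 'number')               -> f.type = ['string', 'number']
//   setType(f, ['string', 'number'], 'boolean')  -> f.type = ['string', 'number', 'boolean']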
// Recursively walk an item's keys and accumulate field types into `fieldsLayer`.
function parseSchema(keys, item, fieldsLayer) {
    keys.forEach((key) => {
        const field = item[key];
        const type = typeof field;

        if (!fieldsLayer[key]) fieldsLayer[key] = {};
        const currentType = fieldsLayer[key].type;

        if (type !== 'object') {
            setType(fieldsLayer[key], currentType, type);
            return;
        }

        if (field === null) {
            // A null value tells us nothing about the type, only that the field is nullable.
            if (!fieldsLayer[key].type) fieldsLayer[key].type = anyType;
            fieldsLayer[key].nullable = true;
            return;
        }

        if (Array.isArray(field)) {
            setType(fieldsLayer[key], currentType, 'array');

            if (!fieldsLayer[key].items) {
                fieldsLayer[key].items = { type: 'object', properties: {} };
            }

            // Recurse into array elements that are plain objects; scalar elements are skipped.
            field.forEach((element) => {
                if (typeof element === 'object' && element !== null && !Array.isArray(element)) {
                    parseSchema(Object.keys(element), element, fieldsLayer[key].items.properties);
                }
            });

            return;
        }

        // Field is a plain object.
        setType(fieldsLayer[key], currentType, 'object');
        fieldsLayer[key].properties = fieldsLayer[key].properties || {};
        parseSchema(Object.keys(field), field, fieldsLayer[key].properties);
    });
}
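// For example (hypothetical item), after
//   parseSchema(Object.keys(item), { a: 1, b: { c: 'x' } }, props)
// `props` holds:
//   { a: { type: 'number' }, b: { type: 'object', properties: { c: { type: 'string' } } } }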
// Shared schema across all datasets
const schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {},
    "required": []
};
for (const datasetId of datasetIds) {
    console.log(`Processing dataset: ${datasetId}`);
    const dataset = await Actor.openDataset(datasetId);

    try {
        // Throwing from the iteratee is the only way to stop `forEach` early once the limit is hit.
        await dataset.forEach(async (item, index) => {
            parseSchema(Object.keys(item), item, schema.properties);
            if (index > 1000000) throw new Error('reached-limit');
        });
    } catch (err) {
        if (err.message === 'reached-limit') {
            console.log(`Dataset ${datasetId} contains more than 1,000,000 items. Schema might not be perfect.`);
        } else {
            throw err;
        }
    }
}
console.log('Storing combined schema under SCHEMA key in key-value store');
await Actor.setValue('SCHEMA', schema);

await Actor.exit();
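Once a run finishes, the stored schema can be fetched from the run's default key-value store. A minimal sketch using apify-client; the store ID placeholder and the APIFY_TOKEN environment variable are assumptions, not part of this Actor:

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// 'your-kv-store-id' is a placeholder for the run's default key-value store ID.
const record = await client.keyValueStore('your-kv-store-id').getRecord('SCHEMA');
console.log(JSON.stringify(record.value, null, 2));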
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
node_modules
storage
package.json
{ "name": "js-empty-project", "version": "0.0.1", "type": "module", "description": "This is a boilerplate of an Apify Actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.1.10", "crawlee": "^3.5.4" }, "scripts": { "start": "node ./src/main.js", "lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx", "lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}