1import { Actor } from 'apify';
await Actor.init();

// `Actor.getInput()` may resolve to null when the run was started without input,
// so fall back to an empty object instead of failing inside the destructuring.
const { datasetIds } = (await Actor.getInput()) ?? {};

// Validate early: a missing value would only fail later in the `for...of` loop with
// "not iterable", and a plain string would be iterated character by character.
if (!Array.isArray(datasetIds)) {
    throw new Error('Invalid input: "datasetIds" must be an array of dataset IDs.');
}
5
// Placeholder type list given to a field that has only been seen as `null` so far.
// `setType` recognises it by identity (`===`) and replaces it with the first
// concrete type observed, so this exact array instance must be the one assigned.
const anyType = [
    'number',
    'string',
    'boolean',
    'object',
    'array',
];
7
/**
 * Records a newly observed type on a schema field, widening it to a union
 * (an array of type names) when the field has already been seen with a
 * different type.
 *
 * @param {object} field - Schema node whose `type` property is updated in place.
 * @param {string|string[]|undefined} currentType - The value of `field.type`
 *   as read by the caller before this call.
 * @param {string} newType - Type name observed for the current value.
 * @returns {void}
 */
function setType(field, currentType, newType) {
    // Nothing recorded yet, or only the `anyType` placeholder (compared by
    // identity): the first concrete type simply replaces it.
    if (!currentType || currentType === anyType) {
        field.type = newType;
        return;
    }

    // Already a union: append only types that are not listed yet. A type that
    // is already present must leave the union untouched; previously this case
    // fell through and wrapped the existing union in another array.
    if (Array.isArray(currentType)) {
        if (!currentType.includes(newType)) {
            field.type.push(newType);
        }
        return;
    }

    // Single type so far: keep it if unchanged, otherwise start a union.
    field.type = currentType === newType ? newType : [currentType, newType];
}
19
/**
 * Merges the shape of one record into an accumulated map of field schemas.
 *
 * For every key, the matching entry of `fieldsLayer` is created if missing and
 * then updated in place with the type of `item[key]`. Nested objects are
 * merged recursively into `properties`, array elements into `items`.
 *
 * @param {string[]} keys - Property names of `item` to merge.
 * @param {object} item - Record whose values are inspected.
 * @param {object} fieldsLayer - Map of key -> schema node; mutated in place.
 * @returns {void}
 */
function parseSchema(keys, item, fieldsLayer) {
    keys.forEach((key) => {
        // Look at own properties only. A data key such as "constructor",
        // "toString" or "__proto__" would otherwise resolve to a member
        // inherited from Object.prototype, so the field would be dropped and
        // `.type` written onto a shared built-in.
        if (!Object.prototype.hasOwnProperty.call(fieldsLayer, key)) {
            // defineProperty (rather than assignment) so that "__proto__"
            // becomes a regular own property instead of changing the prototype.
            Object.defineProperty(fieldsLayer, key, {
                value: {},
                writable: true,
                enumerable: true,
                configurable: true,
            });
        }

        mergeValueIntoSchema(fieldsLayer[key], item[key]);
    });
}

/**
 * Updates a single schema node with the type information of one value.
 *
 * @param {object} fieldSchema - Schema node to update in place.
 * @param {*} value - Observed value.
 * @returns {void}
 */
function mergeValueIntoSchema(fieldSchema, value) {
    const currentType = fieldSchema.type;
    const valueType = typeof value;

    // Primitives: `typeof` already yields the type name to record.
    if (valueType !== 'object') {
        setType(fieldSchema, currentType, valueType);
        return;
    }

    // `typeof null` is 'object'. A null says nothing about the real type, so
    // only mark the field nullable and, if no type is known yet, store the
    // `anyType` placeholder that `setType` replaces later.
    if (value === null) {
        if (!fieldSchema.type) fieldSchema.type = anyType;
        fieldSchema.nullable = true;
        return;
    }

    if (Array.isArray(value)) {
        setType(fieldSchema, currentType, 'array');

        // Element types are inferred from the data instead of assuming
        // objects, so arrays of primitives or nested arrays are described
        // correctly. An array that was always empty keeps `items: {}`.
        fieldSchema.items = fieldSchema.items || {};
        value.forEach((element) => mergeValueIntoSchema(fieldSchema.items, element));
        return;
    }

    // Plain object: merge its own enumerable keys one level down.
    setType(fieldSchema, currentType, 'object');
    fieldSchema.properties = fieldSchema.properties || {};
    parseSchema(Object.keys(value), value, fieldSchema.properties);
}
63
64
// Root node of the combined schema. `properties` is filled in place by
// `parseSchema` while the datasets are scanned; `required` starts out empty.
const schema = {
    $schema: 'http://json-schema.org/draft-07/schema#',
    type: 'object',
    properties: {},
    required: [],
};
71
// Upper bound on how many items of a single dataset are sampled for the schema.
const MAX_ITEMS_PER_DATASET = 1000000;

// Sentinel thrown from the iteratee to stop `dataset.forEach` early. It is
// recognised by type rather than by message text, so an unrelated error that
// happens to carry the same message is never swallowed.
class ItemLimitReachedError extends Error {}

for (const datasetId of datasetIds) {
    console.log(`Processing dataset: ${datasetId}`);
    const dataset = await Actor.openDataset(datasetId);

    try {
        await dataset.forEach((item, index) => {
            // Check before parsing so that exactly MAX_ITEMS_PER_DATASET items
            // are sampled; the limit is only hit when a further item exists,
            // which keeps the "more than" wording of the log message accurate.
            if (index >= MAX_ITEMS_PER_DATASET) {
                throw new ItemLimitReachedError('reached-limit');
            }
            parseSchema(Object.keys(item), item, schema.properties);
        });
    } catch (err) {
        if (!(err instanceof ItemLimitReachedError)) {
            throw err;
        }
        console.log(`Dataset ${datasetId} contains more than ${MAX_ITEMS_PER_DATASET.toLocaleString('en-US')} items. Schema might not be perfect.`);
    }
}
89
// Persist the combined schema: once under a fixed key via `Actor.setValue`,
// and once as a pushed data item via `Actor.pushData`, then end the run.
const SCHEMA_RECORD_KEY = 'SCHEMA';

console.log('Storing combined schema under SCHEMA key in key-value store');
await Actor.setValue(SCHEMA_RECORD_KEY, schema);
await Actor.pushData(schema);

await Actor.exit();