1import { Actor } from 'apify';
2await Actor.init();
3
4const { datasetIds } = await Actor.getInput();
5
6const anyType = ["number", "string", "boolean", "object", "array"];
7
8
9function setType(field, currentType, newType) {
10 if (!currentType || currentType === anyType) {
11 field.type = newType;
12 } else if (Array.isArray(currentType) && !currentType.includes(newType)) {
13 field.type.push(newType);
14 } else if (currentType && currentType !== newType) {
15 field.type = [currentType, newType];
16 } else {
17 field.type = newType;
18 }
19}
20
21
22function ensureStatsNode(statsLayer, key) {
23 if (!statsLayer[key]) {
24 statsLayer[key] = {
25 presentCount: 0,
26 nullCount: 0,
27 typeCounts: {},
28 objectInstances: 0,
29 arrayObjectItemInstances: 0,
30 properties: {},
31 items: { properties: {} },
32 };
33 }
34 return statsLayer[key];
35}
36
37function incType(statsNode, typeName) {
38 statsNode.typeCounts[typeName] = (statsNode.typeCounts[typeName] || 0) + 1;
39}
40
41function toPercent(part, whole) {
42 if (!whole) return 0;
43 return Math.round((part / whole) * 10000) / 100;
44}
45
46
47
48function finalizeStats(statsLayer, denom) {
49 const out = {};
50
51 for (const [key, node] of Object.entries(statsLayer)) {
52 const present = node.presentCount;
53
54 out[key] = {
55 presentCount: present,
56 presentPct: toPercent(present, denom),
57
58 nullCount: node.nullCount,
59 nullPct: toPercent(node.nullCount, present),
60
61 typeCounts: { ...node.typeCounts },
62 typePct: Object.fromEntries(
63 Object.entries(node.typeCounts).map(([t, c]) => [t, toPercent(c, present)])
64 ),
65 };
66
67
68 if (node.objectInstances > 0 && Object.keys(node.properties).length > 0) {
69 out[key].properties = finalizeStats(node.properties, node.objectInstances);
70 out[key].objectInstances = node.objectInstances;
71 }
72
73
74 if (node.arrayObjectItemInstances > 0 && Object.keys(node.items.properties).length > 0) {
75 out[key].items = {
76 objectItemInstances: node.arrayObjectItemInstances,
77 properties: finalizeStats(node.items.properties, node.arrayObjectItemInstances),
78 };
79 }
80 }
81
82 return out;
83}
84
85
86function parseSchema(keys, item, fieldsLayer, statsLayer, parentDenomCounter) {
87
88
89 parentDenomCounter.count += 1;
90
91 keys.forEach((key) => {
92 const field = item[key];
93 const jsType = typeof field;
94
95 if (!fieldsLayer[key]) fieldsLayer[key] = {};
96 const currentType = fieldsLayer[key].type;
97
98 const statsNode = ensureStatsNode(statsLayer, key);
99 statsNode.presentCount += 1;
100
101
102 if (jsType !== 'object') {
103 setType(fieldsLayer[key], currentType, jsType);
104 incType(statsNode, jsType);
105 return;
106 }
107
108
109 if (field === null) {
110 if (!fieldsLayer[key].type) fieldsLayer[key].type = anyType;
111 fieldsLayer[key].nullable = true;
112
113 statsNode.nullCount += 1;
114 incType(statsNode, 'null');
115 return;
116 }
117
118
119 if (Array.isArray(field)) {
120 setType(fieldsLayer[key], currentType, 'array');
121 incType(statsNode, 'array');
122
123 if (!fieldsLayer[key].items) {
124 fieldsLayer[key].items = { type: 'object', properties: {} };
125 }
126
127
128 field.forEach((el) => {
129 if (typeof el === 'object' && el !== null && !Array.isArray(el)) {
130 statsNode.arrayObjectItemInstances += 1;
131 parseSchema(
132 Object.keys(el),
133 el,
134 fieldsLayer[key].items.properties,
135 statsNode.items.properties,
136
137 { count: 0 }
138 );
139 }
140 });
141
142 return;
143 }
144
145
146 setType(fieldsLayer[key], currentType, 'object');
147 incType(statsNode, 'object');
148
149 fieldsLayer[key].properties = fieldsLayer[key].properties || {};
150 statsNode.objectInstances += 1;
151
152
153
154
155 parseSchema(
156 Object.keys(field),
157 field,
158 fieldsLayer[key].properties,
159 statsNode.properties,
160 { count: 0 }
161 );
162 });
163}
164
165
166const schema = {
167 "$schema": "http://json-schema.org/draft-07/schema#",
168 "type": "object",
169 "properties": {},
170 "required": []
171};
172
173
174const stats = {
175 rootObjects: 0,
176 properties: {}
177};
178
179for (const datasetId of datasetIds) {
180 console.log(`Processing dataset: ${datasetId}`);
181 const dataset = await Actor.openDataset(datasetId);
182
183 try {
184 await dataset.forEach(async (item, index) => {
185
186 stats.rootObjects += 1;
187
188
189 parseSchema(
190 Object.keys(item),
191 item,
192 schema.properties,
193 stats.properties,
194 { count: 0 }
195 );
196
197 if (index > 1000000) throw new Error('reached-limit');
198 });
199 } catch (err) {
200 if (err.message === 'reached-limit') {
201 console.log(`Dataset ${datasetId} contains more than 1,000,000 items. Schema might not be perfect.`);
202 } else {
203 throw err;
204 }
205 }
206}
207
208
209const statsReport = {
210 rootObjects: stats.rootObjects,
211 properties: finalizeStats(stats.properties, stats.rootObjects),
212};
213
214console.log('Storing combined schema under SCHEMA key in key-value store');
215await Actor.setValue('SCHEMA', schema);
216
217
218console.log('Storing field stats with percentages under STATS key in key-value store');
219await Actor.setValue('STATS', statsReport);
220
221
222await Actor.pushData({ schema, stats: statsReport });
223
224await Actor.exit();