Datasets Compare
Try for free
No credit card required
View all Actors
Datasets Compare
petr_cermak/datasets-compare
Try for free
No credit card required
Act for comparing crawler execution results. By default the final result set will contain only new and updated records.
Dockerfile
1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-basic:v0.21.10
5
6# First, copy just package.json and package-lock.json, since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
INPUT_SCHEMA.json
1{
2 "title": "Datasets Compare",
3 "description": "Compares two datasets; by default the result contains only new and updated records",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "idAttr": {
8 "title": "ID attribute",
9 "type": "string",
10 "description": "ID attribute name",
11 "editor": "textfield"
12 },
13 "oldDataset": {
14 "title": "Old dataset ID",
15 "type": "string",
16 "description": "Old dataset ID",
17 "editor": "textfield"
18 },
19 "newDataset": {
20 "title": "New dataset ID",
21 "type": "string",
22 "description": "New dataset ID",
23 "editor": "textfield"
24 }
25 }
26}
main.js
1const Apify = require('apify');
2const _ = require('lodash');
3
/**
 * Builds the lookup key that identifies a record.
 *
 * @param {Object} result - Record to derive the key from.
 * @param {string|string[]} idAttr - Name of the ID attribute, or a list of
 *   attribute names whose values are joined with '_' into a composite key.
 * @returns {*} null when `result` is falsy; the joined string for a list of
 *   attributes; otherwise the raw value of `result[idAttr]`.
 */
function createKey(result, idAttr){
    if(!result){
        return null;
    }
    if(!Array.isArray(idAttr)){
        return result[idAttr];
    }
    const parts = [];
    for(const attrName of idAttr){
        parts.push(result[attrName]);
    }
    return parts.join('_');
}
11
/**
 * Reads a dataset page by page and hands every non-empty page to a callback.
 *
 * Pages are requested with a fixed size of 10000 items. Paging stops at the
 * first response that is falsy or carries no items. A response counts as
 * non-empty when it has a truthy `length` itself or a truthy `items.length`.
 *
 * @param {string} datasetId - ID of the dataset to read.
 * @param {Function} process - Async callback that receives each page exactly
 *   as returned by `Apify.client.datasets.getItems`.
 * @param {number} [offset] - Offset to start from; any falsy value means 0.
 * @returns {Promise<undefined>} Resolves once all pages were processed.
 */
async function loadResults(datasetId, process, offset){
    const limit = 10000;
    let position = offset ? offset : 0;
    while(true){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset: position,
            limit
        });
        const itemCount = page && (page.length || (page.items && page.items.length));
        if(!itemCount){
            return;
        }
        await process(page);
        position = position + limit;
    }
}
26
/**
 * Loads the old dataset and indexes its records by their ID key.
 *
 * The map has no prototype, so record IDs such as "constructor", "toString"
 * or "__proto__" are stored and looked up like any other key instead of
 * colliding with members inherited from Object.prototype.
 *
 * Records for which `createKey` yields a falsy key are skipped. When several
 * records share a key, the one loaded last wins.
 *
 * @param {string} oldExecId - ID of the dataset holding the old results.
 * @param {string|string[]} idAttr - ID attribute name(s), passed to `createKey`.
 * @returns {Promise<Object>} Null-prototype object mapping key -> old record.
 */
async function createCompareMap(oldExecId, idAttr){
    const data = Object.create(null);
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        // `items` may hold nested arrays; flatten one level like the comparison step does.
        const results = _.flatten(fullResults.items);
        for(const result of results){
            const key = createKey(result, idAttr);
            if(key){
                data[key] = result;
            }
        }
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}
43
/**
 * Compares the records of the new dataset against the map of old records and
 * emits the records selected by `settings`.
 *
 * Each new record with a usable ID is classified as:
 *  - NEW       - no old record with the same key,
 *  - UNCHANGED - deep-equal to the old record, or (when `settings.updatedIf`
 *                is set) none of the listed attributes changed,
 *  - UPDATED   - anything else.
 * Old records whose key never appeared in the new dataset are DELETED.
 * Matched keys are removed from `compareMap`, so the map is consumed.
 * Records are annotated in place (status / changes attributes).
 *
 * @param {string} newExecId - ID of the dataset holding the new results.
 * @param {Object|null} compareMap - key -> old record map, or null to treat
 *   every record as NEW.
 * @param {string|string[]} idAttr - ID attribute name(s), passed to `createKey`.
 * @param {Object} settings
 * @param {*} settings.returnNew - truthy to emit NEW records.
 * @param {*} settings.returnUpd - truthy to emit UPDATED records.
 * @param {*} settings.returnDel - truthy to emit DELETED records.
 * @param {*} settings.returnUnc - truthy to emit UNCHANGED records.
 * @param {boolean} settings.addStatus - write the status into `settings.statusAttr`.
 * @param {boolean} settings.addChanges - write the changed attribute list of
 *   emitted UPDATED records into `settings.changesAttr`.
 * @param {*} settings.stringifyChanges - join the change list with ', '.
 * @param {string[]} [settings.updatedIf] - only changes to these attributes
 *   make a record UPDATED.
 * @param {*} settings.useDataset - truthy to push emitted records to the
 *   default dataset in batches instead of returning them.
 * @returns {Promise<Array|undefined>} The emitted records, or undefined when
 *   `settings.useDataset` is set (records were pushed instead).
 */
async function compareResults(newExecId, compareMap, idAttr, settings){
    let data = [];
    let processed = 0;
    let newCount = 0;
    let updCount = 0;
    let delCount = 0;
    let uncCount = 0;

    // Sends the buffered records to the dataset; a no-op when nothing is buffered.
    const flush = async () => {
        if(!data.length){return;}
        await Apify.pushData(data);
        data = [];
    };

    // In dataset mode records are pushed in batches of 100, otherwise just collected.
    const pushData = async (value) => {
        data.push(value);
        if(settings.useDataset && data.length >= 100){
            await flush();
        }
    };

    const setStatus = (record, status) => {
        if(settings.addStatus){record[settings.statusAttr] = status;}
    };

    const addUnchanged = async (record) => {
        setStatus(record, 'UNCHANGED');
        if(settings.returnUnc){await pushData(record);}
        uncCount++;
    };

    const addUpdated = async (record, oldRecord, changes) => {
        setStatus(record, 'UPDATED');
        if(settings.returnUpd){
            if(settings.addChanges){
                const tChanges = changes || getChangeAttributes(oldRecord, record);
                record[settings.changesAttr] = settings.stringifyChanges ? tChanges.join(', ') : tChanges;
            }
            await pushData(record);
        }
        updCount++;
    };

    console.log('comparing results');
    await loadResults(newExecId, async (fullResults) => {
        const results = _.flatten(fullResults.items);
        for(const result of results){
            const id = createKey(result, idAttr);
            if(!id){
                console.log('record is missing id (' + idAttr + '): ' + JSON.stringify(result));
                continue;
            }

            // Own-property check: an ID like "toString" must not match Object.prototype members.
            const hasOld = Boolean(compareMap) && Object.prototype.hasOwnProperty.call(compareMap, id);
            const oldResult = hasOld ? compareMap[id] : null;

            if(!oldResult){
                setStatus(result, 'NEW');
                if(settings.returnNew){await pushData(result);}
                newCount++;
            }
            else if(_.isEqual(result, oldResult)){
                await addUnchanged(result);
            }
            else if(settings.updatedIf){
                // Only changes to the watched attributes count as an update.
                const changes = getChangeAttributes(oldResult, result);
                const intersection = _.intersection(settings.updatedIf, changes);
                if(intersection.length){await addUpdated(result, oldResult, intersection);}
                else{await addUnchanged(result);}
            }
            else{
                await addUpdated(result, oldResult);
            }

            // Whatever stays in the map afterwards was not seen in the new dataset.
            if(compareMap){delete compareMap[id];}
        }
        processed += results.length;
        console.log('compared new results: ' + processed);
    });
    console.log('comparing results finished');

    if(compareMap && settings.returnDel){
        console.log('processing deleted results');
        for(const oldResult of Object.values(compareMap)){
            setStatus(oldResult, 'DELETED');
            await pushData(oldResult);
            delCount++;
        }
        console.log('processing deleted results finished');
    }

    console.log('new: ' + newCount + ', updated: ' + updCount +
        (settings.returnDel ? (', deleted: ' + delCount) : '') +
        ', unchanged: ' + uncCount);

    if(!settings.useDataset){return data;}
    // Await the last partial batch so it is stored (and errors surface) before the caller continues.
    await flush();
}
127
/**
 * Lists the attributes of `obj1` whose values differ from those in `obj2`.
 *
 * Nested objects and arrays are descended into, and each differing leaf is
 * reported with its full '/'-separated path (e.g. 'address/geo/lat').
 * Only the keys of `obj1` are inspected; attributes that exist solely in
 * `obj2` are not reported.
 *
 * @param {Object} obj1 - Reference object (the old record).
 * @param {Object} obj2 - Object to compare against (the new record).
 * @param {string} [prefix] - Path prefix of the current nesting level,
 *   including the trailing '/'; used by the recursion.
 * @param {string[]} [out] - Accumulator to append to; used by the recursion.
 * @returns {string[]} Paths of the changed attributes (`out` when it was given).
 */
function getChangeAttributes(obj1, obj2, prefix, out){
    const changes = out ? out : [];
    if(!obj1){
        return changes;
    }
    for(const key of Object.keys(obj1)){
        const v1 = obj1[key];
        const v2 = obj2 ? obj2[key] : null;
        if(_.isEqual(v1, v2)){
            continue;
        }
        const path = prefix ? prefix + key : key;
        if(v1 !== null && typeof v1 === 'object'){
            // Carry the whole path down, so deeper levels keep their ancestors.
            getChangeAttributes(v1, v2, path + '/', changes);
        }
        else{
            changes.push(path);
        }
    }
    return changes;
}
144
/**
 * Actor entry point.
 *
 * Reads the INPUT record, builds the comparison settings, compares the new
 * dataset against the old one and, unless the results were pushed to the
 * dataset (`useDataset`), stores them under the OUTPUT key.
 *
 * The configuration is taken from `input.data` (an object or a JSON string)
 * when present, otherwise from the input itself. `idAttr` and `newDataset`
 * are required. When `oldDataset` is missing, a warning is logged and the
 * comparison runs without an old map, so every record is classified as NEW.
 */
Apify.main(async () => {
    const input = await Apify.getValue('INPUT');
    if(!input){
        throw new Error('missing INPUT');
    }

    let data = input;
    if(input.data){
        data = typeof input.data === 'string' ? JSON.parse(input.data) : input.data;
    }

    if(!data.idAttr){
        throw new Error('missing "idAttr" attribute in INPUT');
    }
    if(!data.newDataset){
        throw new Error('missing "newDataset" attribute in INPUT');
    }
    if(!data.oldDataset){
        // Warn only: the run continues without an old map, as the message states.
        console.log('warning: missing "oldDataset" attribute in INPUT, all results will be identified as NEW');
    }

    if(data.token){Apify.client.setOptions({token: data.token});}
    if(data.userId){Apify.client.setOptions({userId: data.userId});}

    // `return` selects which record kinds are emitted, e.g. 'new, updated, deleted'.
    // String() also accepts a list such as ['new', 'deleted'].
    const returnSpec = String(data.return || 'new, updated');
    const settings = {
        returnNew: /new/i.test(returnSpec),
        returnUpd: /updated/i.test(returnSpec),
        returnDel: /deleted/i.test(returnSpec),
        returnUnc: /unchanged/i.test(returnSpec),
        addStatus: Boolean(data.addStatus),
        addChanges: Boolean(data.addChanges),
        statusAttr: data.statusAttr ? data.statusAttr : 'status',
        changesAttr: data.changesAttr ? data.changesAttr : 'changes',
        stringifyChanges: data.stringifyChanges,
        updatedIf: data.updatedIf,
        useDataset: data.useDataset
    };

    const compareMap = data.oldDataset ? (await createCompareMap(data.oldDataset, data.idAttr)) : null;
    // An `_id` on the raw input takes precedence over `newDataset` as the dataset to read.
    const resultData = await compareResults(input._id || data.newDataset, compareMap, data.idAttr, settings);

    if(resultData){await Apify.setValue('OUTPUT', resultData);}
    console.log('finished');
});
package.json
1{
2 "name": "apify-project",
3 "version": "0.0.1",
4 "description": "",
5 "author": "It's not you it's me",
6 "license": "ISC",
7 "dependencies": {
8 "apify": "0.21.10",
9 "lodash": "latest",
10 "bluebird": "latest"
11 },
12 "scripts": {
13 "start": "node main.js"
14 }
15}
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
3 stars
0% runs succeeded
Created in Apr 2018
Modified 3 years ago
Categories