Datasets Compare avatar
Datasets Compare

Pricing

Pay per usage

Go to Store
Datasets Compare

Datasets Compare

Developed by

Petr Cermak

Petr Cermak

Maintained by Community

Act for comparing crawler execution results. By default the final result set will contain only new and updated records.

0.0 (0)

Pricing

Pay per usage

4

Total users

21

Monthly users

1

Runs succeeded

0%

Last modified

3 years ago

Dockerfile

# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-basic:v0.21.10
# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step; copying them
# separately from the rest of the sources speeds up the build
COPY package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, then print the dependency
# tree plus the Node.js and NPM versions for debugging. The "|| true" keeps
# the build going even if "npm list" exits with a non-zero status.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy source code to container
# Do this in the last step, to have a fast build if only the source code changed
COPY . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

INPUT_SCHEMA.json

{
"title": "Datasets Compare",
"description": "Input for comparing an old and a new dataset by an ID attribute",
"type": "object",
"schemaVersion": 1,
"properties": {
"idAttr": {
"title": "ID attribute",
"type": "string",
"description": "ID attribute name",
"editor": "textfield"
},
"oldDataset": {
"title": "Old dataset ID",
"type": "string",
"description": "Old dataset ID",
"editor": "textfield"
},
"newDataset": {
"title": "New dataset ID",
"type": "string",
"description": "New dataset ID",
"editor": "textfield"
}
}
}

main.js

1const Apify = require('apify');
2const _ = require('lodash');
3
/**
 * Builds the lookup key that identifies a record.
 *
 * @param {Object} result - Record to build the key for; may be null/undefined.
 * @param {string|string[]} idAttr - Attribute name, or a list of attribute
 *   names whose values are joined with '_' to form a composite key.
 * @returns {*} `result[idAttr]` for a single attribute, the joined string for
 *   a composite key, or null when there is no record or when none of the
 *   composite attributes is present on it.
 */
function createKey(result, idAttr){
    if(!result){return null;}
    if(!Array.isArray(idAttr)){return result[idAttr];}

    const parts = idAttr.map(ida => result[ida]);
    // Array#join renders null/undefined as '', so a record lacking every ID
    // attribute would otherwise get a truthy key such as '_' and collide with
    // all other ID-less records instead of being reported as missing its ID.
    const hasAnyPart = parts.some(part => part !== null && part !== undefined);
    return hasAnyPart ? parts.join('_') : null;
}
11
/**
 * Reads a dataset page by page and hands every non-empty page to a callback.
 *
 * Pages are requested with a fixed size of 10000 items. Reading stops at the
 * first page that is falsy or carries no items, whether the page is a bare
 * array or an object with an `items` array.
 *
 * @param {string} datasetId - ID of the dataset to read.
 * @param {Function} process - Async callback invoked with each page exactly as
 *   returned by the client; it is awaited before the next page is requested.
 * @param {number} [offset] - Index of the first item to read; defaults to 0.
 * @returns {Promise<void>}
 */
async function loadResults(datasetId, process, offset){
    const pageSize = 10000;
    let position = offset ? offset : 0;

    for(;;){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset: position,
            limit: pageSize
        });
        if(!page){return;}

        const isFilledArray = Boolean(page.length);
        const hasFilledItems = Boolean(page.items && page.items.length);
        if(!isFilledArray && !hasFilledItems){return;}

        await process(page);
        position += pageSize;
    }
}
26
/**
 * Loads the old dataset and indexes its records by their key.
 *
 * Records for which `createKey` returns a falsy key are skipped. When several
 * records share a key, the one loaded last wins.
 *
 * @param {string} oldExecId - ID of the dataset holding the old results.
 * @param {string|string[]} idAttr - ID attribute name(s) passed to `createKey`.
 * @returns {Promise<Object>} Prototype-less dictionary mapping key -> record.
 */
async function createCompareMap(oldExecId, idAttr){
    // No prototype: record IDs such as '__proto__' or 'constructor' must be
    // stored (and later looked up) as ordinary keys, not as inherited members.
    const data = Object.create(null);
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        // loadResults forwards pages either as a bare array or as an object
        // with an `items` array; accept both so no page is silently dropped.
        const page = Array.isArray(fullResults) ? fullResults : (fullResults && fullResults.items);
        const results = _.flatten(page || []);
        for(const result of results){
            const key = createKey(result, idAttr);
            if(key){data[key] = result;}
        }
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}
43
/**
 * Compares the records of the new dataset against a map of old records and
 * classifies each one as NEW, UPDATED, UNCHANGED or (for leftovers of the old
 * map) DELETED.
 *
 * Side effects visible in this function:
 *  - every matched key is deleted from `compareMap`, so after the run the map
 *    holds only records that were not seen in the new dataset;
 *  - records are mutated in place when `settings.addStatus` or
 *    `settings.addChanges` is set;
 *  - with `settings.useDataset`, selected records are pushed via
 *    `Apify.pushData` in batches of 100.
 *
 * @param {string} newExecId - ID of the dataset holding the new results.
 * @param {Object|null} compareMap - key -> old record; when null every record
 *   is classified as NEW and no DELETED records are produced.
 * @param {string|string[]} idAttr - ID attribute name(s) passed to `createKey`.
 * @param {Object} settings - Flags read here: returnNew, returnUpd, returnDel,
 *   returnUnc, addStatus, statusAttr, addChanges, changesAttr,
 *   stringifyChanges, updatedIf, useDataset.
 * @returns {Promise<Array|undefined>} The selected records when
 *   `settings.useDataset` is falsy; otherwise undefined, resolved only after
 *   the last batch has been pushed.
 */
async function compareResults(newExecId, compareMap, idAttr, settings){
    const hasOwn = Object.prototype.hasOwnProperty;
    let data = [];
    let processed = 0;
    let newCount = 0, updCount = 0, delCount = 0, uncCount = 0;

    // Sends the buffered records to the dataset and starts a fresh buffer.
    const flushData = async () => {
        if(!data.length){return;}
        await Apify.pushData(data);
        data = [];
    };

    // Collects a selected record; in dataset mode the buffer is flushed every 100 records.
    const pushData = async (value) => {
        data.push(value);
        if(settings.useDataset && data.length >= 100){await flushData();}
    };

    // Writes the status attribute onto a record when statuses are requested.
    const setStatus = (record, status) => {
        if(settings.addStatus){record[settings.statusAttr] = status;}
    };

    console.log('comparing results');
    await loadResults(newExecId, async (fullResults) => {
        // loadResults forwards pages either as a bare array or as an object
        // with an `items` array; accept both so no page is silently dropped.
        const page = Array.isArray(fullResults) ? fullResults : (fullResults && fullResults.items);
        const results = _.flatten(page || []);
        for(const result of results){
            const id = createKey(result, idAttr);
            if(!id){
                console.log('record is missing id (' + idAttr + '): ' + JSON.stringify(result));
                continue;
            }

            // Own-property lookup only: an ID such as 'toString' must not match
            // a member inherited from Object.prototype when the map is a plain object.
            const isKnown = Boolean(compareMap) && hasOwn.call(compareMap, id);
            const oldResult = isKnown ? compareMap[id] : null;

            if(!oldResult){
                setStatus(result, 'NEW');
                if(settings.returnNew){await pushData(result);}
                newCount++;
            }
            else if(_.isEqual(result, oldResult)){
                setStatus(result, 'UNCHANGED');
                if(settings.returnUnc){await pushData(result);}
                uncCount++;
            }
            else{
                // With `updatedIf`, only changes to the listed attributes count as an update.
                const watched = settings.updatedIf ?
                    _.intersection(settings.updatedIf, getChangeAttributes(oldResult, result)) :
                    null;
                if(watched && !watched.length){
                    setStatus(result, 'UNCHANGED');
                    if(settings.returnUnc){await pushData(result);}
                    uncCount++;
                }
                else{
                    setStatus(result, 'UPDATED');
                    if(settings.returnUpd){
                        if(settings.addChanges){
                            const tChanges = watched || getChangeAttributes(oldResult, result);
                            result[settings.changesAttr] = settings.stringifyChanges ? tChanges.join(', ') : tChanges;
                        }
                        await pushData(result);
                    }
                    updCount++;
                }
            }
            // Whatever is left in the map after the run was not seen in the new dataset.
            if(compareMap){delete compareMap[id];}
        }
        processed += results.length;
        console.log('compared new results: ' + processed);
    });
    console.log('comparing results finished');

    if(compareMap && settings.returnDel){
        console.log('processing deleted results');
        for(const oldResult of Object.values(compareMap)){
            setStatus(oldResult, 'DELETED');
            await pushData(oldResult);
            delCount++;
        }
        console.log('processing deleted results finished');
    }

    console.log('new: ' + newCount + ', updated: ' + updCount +
        (settings.returnDel ? (', deleted: ' + delCount) : '') +
        ', unchanged: ' + uncCount);

    if(!settings.useDataset){return data;}
    // Await the final flush so the last partial batch is stored (and any push
    // error surfaces) before the caller treats the comparison as finished.
    await flushData();
}
127
/**
 * Lists the attributes of `obj1` whose values differ from those in `obj2`.
 *
 * Only keys enumerated on `obj1` are inspected, so attributes that exist
 * solely on `obj2` are not reported at the top level. Nested objects and
 * arrays are descended into and reported as '/'-separated paths, e.g. 'a/b/c'.
 *
 * @param {Object} obj1 - Old value; a falsy value yields no changes.
 * @param {Object} obj2 - New value; when falsy every attribute is compared to null.
 * @param {string} [prefix] - Path prefix (ending in '/') used during recursion.
 * @param {string[]} [out] - Accumulator the changes are appended to.
 * @returns {string[]} The accumulator with the changed attribute paths.
 */
function getChangeAttributes(obj1, obj2, prefix, out){
    const changes = out ? out : [];
    const path = prefix ? prefix : '';
    if(!obj1){return changes;}

    for(const key in obj1){
        const v1 = obj1[key];
        const v2 = obj2 ? obj2[key] : null;
        if(_.isEqual(v1, v2)){continue;}

        if(v1 !== null && typeof v1 === 'object'){
            const before = changes.length;
            // Carry the full path down; passing only `key + '/'` would drop the
            // parent segments for anything nested deeper than one level.
            getChangeAttributes(v1, v2, path + key + '/', changes);
            // The values differ, yet no key of the old value does (e.g. [] -> ['x']
            // or an attribute added to a nested object): report the attribute itself
            // so that the change is not lost.
            if(changes.length === before){changes.push(path + key);}
        }
        else{changes.push(path + key);}
    }
    return changes;
}
144
/**
 * Actor entry point: reads INPUT, builds the comparison settings, compares the
 * new dataset against the old one and, unless the results were pushed to the
 * dataset, stores them under the OUTPUT key.
 *
 * The options are taken from `input.data` (an object or a JSON string) when
 * present, otherwise from the input object itself.
 */
Apify.main(async () => {
    const input = await Apify.getValue('INPUT');
    if(!input){
        throw new Error('missing INPUT');
    }

    const data = input.data ? (typeof input.data === 'string' ? JSON.parse(input.data) : input.data) : input;
    if(!data.idAttr){
        throw new Error('missing "idAttr" attribute in INPUT');
    }
    if(!data.newDataset){
        throw new Error('missing "newDataset" attribute in INPUT');
    }
    if(!data.oldDataset){
        // Only a warning: the run continues without a compare map, which makes
        // every record NEW, exactly as the message announces.
        console.log('warning: missing "oldDataset" attribute in INPUT, all results will be identified as NEW');
    }

    if(data.token){Apify.client.setOptions({token: data.token});}
    if(data.userId){Apify.client.setOptions({userId: data.userId});}

    // Comma-separated list of record kinds to return; new and updated by default.
    const wanted = data.return || 'new, updated';
    const settings = {
        returnNew: /new/i.test(wanted),
        returnUpd: /updated/i.test(wanted),
        returnDel: /deleted/i.test(wanted),
        returnUnc: /unchanged/i.test(wanted),
        addStatus: data.addStatus ? true : false,
        addChanges: data.addChanges ? true : false,
        statusAttr: data.statusAttr ? data.statusAttr : 'status',
        changesAttr: data.changesAttr ? data.changesAttr : 'changes',
        stringifyChanges: data.stringifyChanges,
        updatedIf: data.updatedIf,
        useDataset: data.useDataset
    };

    const compareMap = data.oldDataset ? (await createCompareMap(data.oldDataset, data.idAttr)) : null;
    // NOTE(review): `input._id` takes precedence over `newDataset`; presumably it is
    // set when the act is triggered by another run - confirm against the callers.
    const resultData = await compareResults(input._id || data.newDataset, compareMap, data.idAttr, settings);

    // compareResults returns nothing when the records were pushed to the dataset.
    if(resultData){await Apify.setValue('OUTPUT', resultData);}
    console.log('finished');
});

package.json

{
"name": "apify-project",
"version": "0.0.1",
"description": "",
"author": "It's not you it's me",
"license": "ISC",
"dependencies": {
"apify": "0.21.10",
"lodash": "latest",
"bluebird": "latest"
},
"scripts": {
"start": "node main.js"
}
}