Datasets Compare avatar
Datasets Compare

Pricing

Pay per usage

Go to Store
Datasets Compare

Datasets Compare

Developed by

Petr Cermak

Maintained by Community

Act for comparing crawler execution results. By default the final result set will contain only new and updated records.

0.0 (0)

Pricing

Pay per usage

4

Monthly users

1

0

Last modified

3 years ago

Dockerfile

1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-basic:v0.21.10
5
6# Copy just package.json and package-lock.json first, since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY  . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

INPUT_SCHEMA.json

1{
2  "title": "Datasets Compare",
3  "description": "Compares an old and a new dataset by an ID attribute and reports new, updated, deleted and unchanged records",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "idAttr": {
8      "title": "ID attribute",
9      "type": "string",
10      "description": "ID attribute name",
11      "editor": "textfield"
12    },
13    "oldDataset": {
14      "title": "Old dataset ID",
15      "type": "string",
16      "description": "Old dataset ID",
17      "editor": "textfield"
18    },
19    "newDataset": {
20      "title": "New dataset ID",
21      "type": "string",
22      "description": "New dataset ID",
23      "editor": "textfield"
24    }
25  }
26}

main.js

1const Apify = require('apify');
2const _ = require('lodash');
3
/**
 * Builds the lookup key that identifies a record.
 *
 * @param {Object} result - Record to build the key for.
 * @param {string|string[]} idAttr - Name of the ID attribute, or a list of
 *     attribute names forming a composite key.
 * @returns {*} For a single attribute name, the attribute value as-is.
 *     For a list, the component values joined with "_". `null` when there
 *     is no record or when every component of a composite key is missing.
 */
function createKey(result, idAttr){
    if(!result){return null;}
    if(!Array.isArray(idAttr)){return result[idAttr];}

    const parts = idAttr.map(ida => result[ida]);
    // join() renders null/undefined as '', so a record without any ID value
    // would otherwise get the truthy key "_" and collide with all other such
    // records instead of being treated as "missing id" by the callers.
    if(parts.every(part => part === null || part === undefined)){return null;}
    return parts.join('_');
}
11
/**
 * Reads a dataset page by page (10000 items per request) and hands every
 * non-empty page to `process`. Stops at the first empty page.
 *
 * @param {string} datasetId - ID of the dataset to read.
 * @param {function(Object): Promise} process - Async callback invoked with
 *     each page. The page always exposes its records under `items`.
 * @param {number} [offset] - Index of the first item to read; defaults to 0.
 * @returns {Promise<void>} Resolves once all pages have been processed.
 */
async function loadResults(datasetId, process, offset){
    const limit = 10000;
    let position = offset || 0;

    while(true){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset: position,
            limit
        });
        const isBareArray = Array.isArray(page);
        const items = isBareArray ? page : (page && page.items);
        if(!items || !items.length){break;}

        // The callers in this file read `page.items`; wrap a bare array so its
        // records are not silently treated as an empty page.
        await process(isBareArray ? {items: page} : page);
        position += limit;
    }
}
26
/**
 * Loads the old dataset and indexes its records by their ID key.
 * Records for which no key can be built are skipped; when several records
 * share a key, the last one loaded wins.
 *
 * @param {string} oldExecId - ID of the dataset holding the old results.
 * @param {string|string[]} idAttr - ID attribute name(s) passed to createKey.
 * @returns {Promise<Object>} Map of key -> old record.
 */
async function createCompareMap(oldExecId, idAttr){
    // Null-prototype object: IDs such as "constructor", "toString" or
    // "__proto__" must behave like ordinary keys, not Object.prototype members.
    const data = Object.create(null);
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        // A page entry may itself be an array of records; flatten one level.
        const results = _.flatten(fullResults.items);
        for(const result of results){
            const key = createKey(result, idAttr);
            if(key){data[key] = result;}
        }
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}
43
/**
 * Compares the records of the new dataset against the map of old records and
 * classifies each one as NEW, UPDATED or UNCHANGED; old records that were not
 * matched are reported as DELETED when requested.
 *
 * Matched entries are removed from `compareMap`, and returned records are
 * mutated in place (status / changes attributes) according to `settings`.
 *
 * @param {string} newExecId - ID of the dataset holding the new results.
 * @param {Object|null} compareMap - Map of key -> old record, or null when
 *     there is nothing to compare against (every record is then NEW).
 * @param {string|string[]} idAttr - ID attribute name(s) passed to createKey.
 * @param {Object} settings - Flags read here: returnNew, returnUpd, returnDel,
 *     returnUnc, addStatus, statusAttr, addChanges, changesAttr,
 *     stringifyChanges, updatedIf, useDataset.
 * @returns {Promise<Array|undefined>} The collected records, or undefined
 *     when `settings.useDataset` is set (records are then stored through
 *     Apify.pushData in batches).
 */
async function compareResults(newExecId, compareMap, idAttr, settings){
    let data = [];
    let processed = 0;
    let newCount = 0, updCount = 0, delCount = 0, uncCount = 0;

    // Sends the buffered records to the default dataset and empties the buffer.
    const flushData = async () => {
        if(!data.length){return;}
        const batch = data;
        data = [];
        await Apify.pushData(batch);
    };

    // Buffers one record; in dataset mode the buffer is flushed every 100 records.
    const pushData = async (value) => {
        data.push(value);
        if(settings.useDataset && data.length >= 100){await flushData();}
    };

    // Own-property lookup so IDs like "constructor" never match inherited members.
    const findOld = (id) => (
        compareMap && Object.prototype.hasOwnProperty.call(compareMap, id) ? compareMap[id] : null
    );

    const markUnchanged = async (result) => {
        if(settings.addStatus){result[settings.statusAttr] = 'UNCHANGED';}
        if(settings.returnUnc){await pushData(result);}
        uncCount++;
    };

    const markUpdated = async (result, oldResult, changes) => {
        // Compute the change list before the status attribute is written, so
        // the status written here can never show up as a change.
        const tChanges = (settings.returnUpd && settings.addChanges) ?
            (changes || getChangeAttributes(oldResult, result)) : null;
        if(settings.addStatus){result[settings.statusAttr] = 'UPDATED';}
        if(settings.returnUpd){
            if(tChanges){
                result[settings.changesAttr] = settings.stringifyChanges ? tChanges.join(', ') : tChanges;
            }
            await pushData(result);
        }
        updCount++;
    };

    console.log('comparing results');
    await loadResults(newExecId, async (fullResults) => {
        const results = _.flatten(fullResults.items);
        for(const result of results){
            const id = createKey(result, idAttr);
            if(!id){
                console.log('record is missing id (' + idAttr + '): ' + JSON.stringify(result));
                continue;
            }
            const oldResult = findOld(id);
            if(!oldResult){
                if(settings.addStatus){result[settings.statusAttr] = 'NEW';}
                if(settings.returnNew){await pushData(result);}
                newCount++;
            }
            else if(_.isEqual(result, oldResult)){await markUnchanged(result);}
            else if(settings.updatedIf){
                // Only changes to the attributes listed in updatedIf count as an update.
                const changes = getChangeAttributes(oldResult, result);
                const intersection = _.intersection(settings.updatedIf, changes);
                if(intersection.length){await markUpdated(result, oldResult, intersection);}
                else{await markUnchanged(result);}
            }
            else{await markUpdated(result, oldResult);}
            // Whatever is left in the map afterwards was not seen in the new dataset.
            if(compareMap){delete compareMap[id];}
        }
        processed += results.length;
        console.log('compared new results: ' + processed);
    });
    console.log('comparing results finished');

    if(compareMap && settings.returnDel){
        console.log('processing deleted results');
        for(const oldResult of Object.values(compareMap)){
            if(settings.addStatus){oldResult[settings.statusAttr] = 'DELETED';}
            await pushData(oldResult);
            delCount++;
        }
        console.log('processing deleted results finished');
    }

    console.log('new: ' + newCount + ', updated: ' + updCount +
                (settings.returnDel ? (', deleted: ' + delCount) : '') +
                ', unchanged: ' + uncCount);
    if(!settings.useDataset){return data;}
    // Wait for the last partial batch; otherwise the caller may finish (and the
    // process exit) before the remaining records are stored.
    await flushData();
}
127
/**
 * Lists the attributes of `obj1` whose values differ in `obj2`.
 * Nested objects/arrays are descended into and reported as "/"-separated
 * paths (e.g. "address/city"). Only keys enumerable on `obj1` are examined,
 * so a top-level attribute that exists only in `obj2` is not reported.
 *
 * @param {Object} obj1 - Reference (old) object.
 * @param {Object} obj2 - Object to compare against (new).
 * @param {string} [prefix] - Path prefix for reported names, used by recursion.
 * @param {string[]} [out] - Array to append to; a new one is created if omitted.
 * @returns {string[]} The array of changed attribute paths (`out` when given).
 */
function getChangeAttributes(obj1, obj2, prefix, out){
    const changes = out ? out : [];
    if(!obj1){return changes;}

    for(const key in obj1){
        const v1 = obj1[key];
        const v2 = obj2 ? obj2[key] : null;
        if(_.isEqual(v1, v2)){continue;}

        const path = prefix ? prefix + key : key;
        if(v1 !== null && typeof v1 === 'object'){
            const before = changes.length;
            // Pass the full path down so deeper levels keep their ancestors.
            getChangeAttributes(v1, v2, path + '/', changes);
            // The values differ but no own key of v1 does (e.g. [] vs ['x'],
            // or v2 only gained keys): report the attribute itself.
            if(changes.length === before){changes.push(path);}
        }
        else{changes.push(path);}
    }
    return changes;
}
144
// Actor entry point: reads INPUT, derives the comparison settings, builds the
// map of old records, compares the new dataset against it and, unless the
// records were pushed to the dataset, stores them under the OUTPUT key.
Apify.main(async () => {
    const input = await Apify.getValue('INPUT');
    if(!input){
        throw new Error('missing INPUT');
    }

    // The configuration may be nested under `data` (as an object or a JSON
    // string); otherwise the input itself is the configuration.
    const data = input.data ? (typeof input.data === 'string' ? JSON.parse(input.data) : input.data) : input;
    if(!data.idAttr){
        throw new Error('missing "idAttr" attribute in INPUT');
    }
    if(!data.oldDataset){
        // Only warn and keep going: without an old dataset the compare map is
        // null and every record is classified as NEW, as the message says.
        console.log('warning: missing "oldDataset" attribute in INPUT, all results will be identified as NEW');
    }
    if(!data.newDataset){
        throw new Error('missing "newDataset" attribute in INPUT');
    }

    if(data.token){Apify.client.setOptions({token: data.token});}
    if(data.userId){Apify.client.setOptions({userId: data.userId});}

    // `return` is a free-form list of record kinds to output, e.g. "new, updated".
    const returned = data.return || 'new, updated';
    const settings = {
        returnNew: /new/i.test(returned),
        returnUpd: /updated/i.test(returned),
        returnDel: /deleted/i.test(returned),
        returnUnc: /unchanged/i.test(returned),
        addStatus: data.addStatus ? true : false,
        addChanges: data.addChanges ? true : false,
        statusAttr: data.statusAttr ? data.statusAttr : 'status',
        changesAttr: data.changesAttr ? data.changesAttr : 'changes',
        stringifyChanges: data.stringifyChanges,
        updatedIf: data.updatedIf,
        useDataset: data.useDataset
    };

    const compareMap = data.oldDataset ? (await createCompareMap(data.oldDataset, data.idAttr)) : null;
    // NOTE(review): `input._id` takes precedence over `newDataset` here —
    // presumably for runs triggered with another run's payload; confirm.
    const resultData = await compareResults(input._id || data.newDataset, compareMap, data.idAttr, settings);

    if(resultData){await Apify.setValue('OUTPUT', resultData);}
    console.log('finished');
});

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "0.21.10",
9        "lodash": "latest",
10        "bluebird": "latest"
11    },
12    "scripts": {
13        "start": "node main.js"
14    }
15}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.