Datasets Compare avatar

Datasets Compare

Try for free

No credit card required

View all Actors
Datasets Compare

Datasets Compare

petr_cermak/datasets-compare
Try for free

No credit card required

Act for comparing crawler execution results. By default the final result set will contain only new and updated records.

Dockerfile

1# This is a template for a Dockerfile used to run acts in the Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-basic:v0.21.10
5
6# Second, copy just package.json and package-lock.json since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY  . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

INPUT_SCHEMA.json

1{
2  "title": "Datasets Compare",
3  "description": "Compares two datasets; by default the result contains only new and updated records",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "idAttr": {
8      "title": "ID attribute",
9      "type": "string",
10      "description": "ID attribute name",
11      "editor": "textfield"
12    },
13    "oldDataset": {
14      "title": "Old dataset ID",
15      "type": "string",
16      "description": "Old dataset ID",
17      "editor": "textfield"
18    },
19    "newDataset": {
20      "title": "New dataset ID",
21      "type": "string",
22      "description": "New dataset ID",
23      "editor": "textfield"
24    }
25  }
26}

main.js

1const Apify = require('apify');
2const _ = require('lodash');
3
/**
 * Builds the lookup key that identifies a record.
 *
 * @param {Object} result - Record to derive the key from.
 * @param {string|string[]} idAttr - Name of the id attribute, or a list of
 *     attribute names that together form a composite key.
 * @returns {*} For a single attribute, the raw attribute value. For a list,
 *     the attribute values joined with '_'. null when `result` is falsy or
 *     when none of the listed attributes is set on the record.
 */
function createKey(result, idAttr){
    if(!result){return null;}
    if(!Array.isArray(idAttr)){return result[idAttr];}

    const parts = idAttr.map(attr => result[attr]);
    // Array#join renders null/undefined as '', so a record lacking every id
    // attribute would otherwise get a truthy separator-only key such as '_'
    // and collide with all other id-less records instead of being reported.
    const allMissing = parts.every(part => part === null || part === undefined);
    return allMissing ? null : parts.join('_');
}
11
/**
 * Reads a dataset page by page and hands every non-empty page to `process`.
 *
 * Pages are requested with a fixed page size until a page without items is
 * returned. The response of `Apify.client.datasets.getItems` is accepted
 * either as an object carrying an `items` array or as a bare array; a bare
 * array is wrapped as `{items: [...]}` so that the callback can always read
 * `page.items`.
 *
 * @param {string} datasetId - ID of the dataset to read.
 * @param {function(Object): Promise} process - Async callback invoked once
 *     per non-empty page, in order; awaited before the next page is fetched.
 * @param {number} [offset] - Index of the first item to read; any falsy
 *     value means 0.
 * @returns {Promise<void>}
 */
async function loadResults(datasetId, process, offset){
    const limit = 10000;
    let position = offset || 0;

    // Iterate instead of recursing so that the pending call chain does not
    // grow by one frame per page on large datasets.
    for(;;){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset: position,
            limit
        });
        if(!page){return;}

        const hasOwnLength = Boolean(page.length);
        const hasItems = Boolean(page.items && page.items.length);
        if(!hasOwnLength && !hasItems){return;}

        await process(Array.isArray(page) ? {items: page} : page);
        position += limit;
    }
}
26
/**
 * Loads the old dataset and indexes its records by their key.
 *
 * @param {string} oldExecId - ID of the dataset holding the old records.
 * @param {string|string[]} idAttr - Id attribute name(s) passed to createKey.
 * @returns {Promise<Object>} Prototype-less object mapping key -> record.
 *     Records without a truthy key are skipped; when several records share
 *     a key the last one loaded wins.
 */
async function createCompareMap(oldExecId, idAttr){
    // The keys come from dataset content. A prototype-less object keeps ids
    // such as "constructor" or "__proto__" from resolving to (or replacing)
    // inherited members when the map is read or written.
    const data = Object.create(null);
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        const results = _.flatten(fullResults.items);
        for(const result of results){
            const key = createKey(result, idAttr);
            if(key){data[key] = result;}
        }
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}
43
/**
 * Compares the records of the new dataset with the map of old records and
 * emits the ones selected by `settings`.
 *
 * Each new record with a key is classified as NEW (no old record under that
 * key), UPDATED (differs from the old record) or UNCHANGED. When
 * `settings.updatedIf` is set, a differing record only counts as UPDATED if
 * one of the listed attributes changed; otherwise it is UNCHANGED. Matched
 * keys are deleted from `compareMap`, so whatever remains afterwards is
 * treated as DELETED. Records are modified in place when a status or a
 * change list is attached.
 *
 * @param {string} newExecId - ID of the dataset holding the new records.
 * @param {Object|null} compareMap - key -> old record; mutated (see above).
 *     When falsy, every record is NEW.
 * @param {string|string[]} idAttr - Id attribute name(s) passed to createKey.
 * @param {Object} settings - Flags read here: returnNew, returnUpd,
 *     returnUnc, returnDel, addStatus, statusAttr, addChanges, changesAttr,
 *     stringifyChanges, updatedIf, useDataset.
 * @returns {Promise<Array|undefined>} The emitted records when
 *     `settings.useDataset` is falsy; otherwise undefined, with the records
 *     pushed through `Apify.pushData` in batches.
 */
async function compareResults(newExecId, compareMap, idAttr, settings){
    let data = [];
    let processed = 0;
    let newCount = 0, updCount = 0, delCount = 0, uncCount = 0;

    // Sends the buffered records to the dataset; a no-op on an empty buffer.
    const flush = async () => {
        if(!data.length){return;}
        await Apify.pushData(data);
        data = [];
    };

    // Collects one output record. In dataset mode the buffer is flushed every
    // 100 records so it does not grow with the size of the result.
    const pushData = async (value) => {
        data.push(value);
        if(settings.useDataset && data.length >= 100){await flush();}
    };

    const emitUnchanged = async (result) => {
        if(settings.addStatus){result[settings.statusAttr] = 'UNCHANGED';}
        if(settings.returnUnc){await pushData(result);}
        uncCount++;
    };

    const emitUpdated = async (result, oldResult, changes) => {
        // Diff before the status is stamped: stamping first would report the
        // status attribute itself as changed whenever the old record has it.
        const wantChanges = settings.returnUpd && settings.addChanges;
        const tChanges = wantChanges ? (changes || getChangeAttributes(oldResult, result)) : null;
        if(settings.addStatus){result[settings.statusAttr] = 'UPDATED';}
        if(settings.returnUpd){
            if(settings.addChanges){
                result[settings.changesAttr] = settings.stringifyChanges ? tChanges.join(', ') : tChanges;
            }
            await pushData(result);
        }
        updCount++;
    };

    console.log('comparing results');
    await loadResults(newExecId, async (fullResults) => {
        const results = _.flatten(fullResults.items);
        for(const result of results){
            const id = createKey(result, idAttr);
            if(!id){
                console.log('record is missing id (' + idAttr + '): ' + JSON.stringify(result));
                continue;
            }

            const oldResult = compareMap ? compareMap[id] : null;
            if(!oldResult){
                if(settings.addStatus){result[settings.statusAttr] = 'NEW';}
                if(settings.returnNew){await pushData(result);}
                newCount++;
            }
            else if(_.isEqual(result, oldResult)){
                await emitUnchanged(result);
            }
            else if(settings.updatedIf){
                // Only changes to the watched attributes count as an update.
                const changes = getChangeAttributes(oldResult, result);
                const intersection = _.intersection(settings.updatedIf, changes);
                if(intersection.length){await emitUpdated(result, oldResult, intersection);}
                else{await emitUnchanged(result);}
            }
            else{
                await emitUpdated(result, oldResult);
            }

            // Whatever is left in the map at the end was not seen again.
            if(compareMap){delete compareMap[id];}
        }
        processed += results.length;
        console.log('compared new results: ' + processed);
    });
    console.log('comparing results finished');

    if(compareMap && settings.returnDel){
        console.log('processing deleted results');
        for(const oldResult of Object.values(compareMap)){
            if(settings.addStatus){oldResult[settings.statusAttr] = 'DELETED';}
            await pushData(oldResult);
            delCount++;
        }
        console.log('processing deleted results finished');
    }

    console.log('new: ' + newCount + ', updated: ' + updCount +
                (settings.returnDel ? (', deleted: ' + delCount) : '') +
                ', unchanged: ' + uncCount);

    if(!settings.useDataset){return data;}
    // Must be awaited: otherwise the caller can finish before the last
    // partial batch has been written.
    await flush();
}
127
/**
 * Lists the attributes of `obj1` whose values differ from those in `obj2`.
 *
 * Values of `obj1` that are objects (arrays included) are descended into and
 * their differing leaves are reported with a '/'-separated path, e.g.
 * 'address/city'. Only the keys of `obj1` are visited, so attributes that
 * exist solely in `obj2` are not reported.
 *
 * @param {Object} obj1 - Reference object whose keys are walked.
 * @param {Object} obj2 - Object compared against; when falsy, every value is
 *     compared with null.
 * @param {string} [prefix] - Path prepended to reported names (used by the
 *     recursion).
 * @param {string[]} [out] - Array to append to (used by the recursion).
 * @returns {string[]} `out` if given, otherwise a new array of changed paths.
 */
function getChangeAttributes(obj1, obj2, prefix, out){
    const changes = out ? out : [];
    if(!obj1){return changes;}

    const base = prefix || '';
    for(const key in obj1){
        const v1 = obj1[key];
        const v2 = obj2 ? obj2[key] : null;
        if(_.isEqual(v1, v2)){continue;}

        if(v1 !== null && typeof v1 === 'object'){
            // Carry the accumulated path down; passing only `key` would drop
            // the outer segments for changes nested three or more levels deep.
            getChangeAttributes(v1, v2, base + key + '/', changes);
        }
        else{changes.push(base + key);}
    }
    return changes;
}
144
// Actor entry point: reads INPUT, compares the old and the new dataset and
// stores the outcome either in the OUTPUT record or, with `useDataset`, in
// the default dataset.
Apify.main(async () => {
    const input = await Apify.getValue('INPUT');
    if(!input){
        throw new Error('missing INPUT');
    }

    // The settings may arrive directly in INPUT or wrapped in `input.data`,
    // either as an object or as a JSON string.
    const data = input.data ? (typeof input.data === 'string' ? JSON.parse(input.data) : input.data) : input;
    if(!data.idAttr){
        throw new Error('missing "idAttr" attribute in INPUT');
    }
    if(!data.oldDataset){
        // Only a warning: the run continues without a compare map, which
        // classifies every record as NEW (returning here would end the run
        // without producing any result).
        console.log('warning: missing "oldDataset" attribute in INPUT, all results will be identified as NEW');
    }
    if(!data.newDataset){
        throw new Error('missing "newDataset" attribute in INPUT');
    }

    if(data.token){Apify.client.setOptions({token: data.token});}
    if(data.userId){Apify.client.setOptions({userId: data.userId});}

    // `return` is a free-form list such as 'new, updated, deleted'; each
    // category is enabled when its name occurs in it (case-insensitive).
    const settings = {};
    data.return = data.return || 'new, updated';
    settings.returnNew = data.return.match(/new/i);
    settings.returnUpd = data.return.match(/updated/i);
    settings.returnDel = data.return.match(/deleted/i);
    settings.returnUnc = data.return.match(/unchanged/i);
    settings.addStatus = data.addStatus ? true : false;
    settings.addChanges = data.addChanges ? true : false;
    settings.statusAttr = data.statusAttr ? data.statusAttr : 'status';
    settings.changesAttr = data.changesAttr ? data.changesAttr : 'changes';
    settings.stringifyChanges = data.stringifyChanges;
    settings.updatedIf = data.updatedIf;
    settings.useDataset = data.useDataset;

    const compareMap = data.oldDataset ? (await createCompareMap(data.oldDataset, data.idAttr)) : null;
    // NOTE(review): `input._id` takes precedence over `newDataset` when
    // present - presumably an ID supplied by whatever triggers the actor; confirm.
    const resultData = await compareResults(input._id || data.newDataset, compareMap, data.idAttr, settings);

    // compareResults returns nothing in dataset mode; the records were
    // already pushed there.
    if(resultData){await Apify.setValue('OUTPUT', resultData);}
    console.log('finished');
});

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "0.21.10",
9        "lodash": "latest",
10        "bluebird": "latest"
11    },
12    "scripts": {
13        "start": "node main.js"
14    }
15}
Developer
Maintained by Community

Actor Metrics

  • 1 monthly user

  • 3 stars

  • 0% runs succeeded

  • Created in Apr 2018

  • Modified 3 years ago

Categories