Act for comparing crawler execution results. By default the final result set will contain only new and updated records.
- Modified
- Users: 12
- Runs: 2,338
Dockerfile
# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-basic:v0.21.10
# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, but print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy the rest of the source code into the container.
# Done as the last step, so rebuilds are fast when only the source code changed.
COPY . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
INPUT_SCHEMA.json
{
"title": "Fructidor.fr scraper",
"description": "Fructidor.fr scraper",
"type": "object",
"schemaVersion": 1,
"properties": {
"idAttr": {
"title": "ID attribute",
"type": "string",
"description": "ID attribute name",
"editor": "textfield"
},
"oldDataset": {
"title": "Old dataset ID",
"type": "string",
"description": "Old dataset ID",
"editor": "textfield"
},
"newDataset": {
"title": "New dataset ID",
"type": "string",
"description": "New dataset ID",
"editor": "textfield"
}
}
}
main.js
main.js is 180 lines long; only the first 50 lines are shown below.
const Apify = require('apify');
const _ = require('lodash');
// Builds the comparison key for a result record.
// idAttr is either a single attribute name or an array of names; for an
// array, the attribute values are joined with '_'. Returns null when the
// record itself is missing (null/undefined).
function createKey(result, idAttr){
    if(!result){
        return null;
    }
    if(Array.isArray(idAttr)){
        const parts = [];
        for(const attr of idAttr){
            parts.push(result[attr]);
        }
        return parts.join('_');
    }
    return result[idAttr];
}
/**
 * Pages through a dataset and feeds each non-empty page to the `process`
 * callback. Pages are fetched in chunks of 10000 items; iteration stops at
 * the first empty page.
 *
 * Fix: the original had two branches that both called `process(newItems)`
 * with identical arguments — collapsed into a single call. The unbounded
 * async recursion was also replaced with a loop.
 *
 * @param {String} datasetId - ID of the dataset to read.
 * @param {Function} process - Async callback invoked with each page
 *        (either a raw array of items or an object with an `items` array,
 *        depending on the client version).
 * @param {Number} [offset=0] - Starting offset into the dataset.
 */
async function loadResults(datasetId, process, offset){
    const limit = 10000;
    if(!offset){offset = 0;}
    for(;;){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset,
            limit
        });
        // A page is non-empty if it is a raw array with items, or an
        // object whose `items` array has items.
        const hasItems = Boolean(page && (page.length || (page.items && page.items.length)));
        if(!hasItems){break;}
        await process(page);
        offset += limit;
    }
}
/**
 * Loads every record from the old dataset and indexes it by its comparison
 * key (see createKey), so new results can be matched against old ones.
 *
 * Fix: loadResults may deliver a page either as a raw array or as an
 * object with an `items` array; the original only read `.items` and would
 * silently produce an empty map for raw-array pages. The unused `index`
 * callback parameter was also dropped.
 *
 * @param {String} oldExecId - ID of the dataset with the previous run's results.
 * @param {String|Array} idAttr - Attribute name(s) that form the record key.
 * @returns {Object} Map from record key to the old result record.
 */
async function createCompareMap(oldExecId, idAttr){
    const data = {};
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        // Accept both page shapes; _.flatten tolerates undefined and
        // returns an empty array in that case.
        const page = Array.isArray(fullResults) ? fullResults : fullResults.items;
        const results = _.flatten(page);
        _.each(results, (result) => {
            const key = createKey(result, idAttr);
            // Records without a usable key cannot be compared — skip them.
            if(key){data[key] = result;}
        });
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}
async function compareResults(newExecId, compareMap, idAttr, settings){
let data = [];
let processed = 0, pushData = null;
let newCount = 0, updCount = 0, delCount = 0, uncCount = 0, index = 0;
if(settings.useDataset){
pushData = async (value, flush) => {
package.json
{
"name": "apify-project",
"version": "0.0.1",
"description": "",
"author": "It's not you it's me",
"license": "ISC",
"dependencies": {
"apify": "0.21.10",
"lodash": "latest",
"bluebird": "latest"
},
"scripts": {
"start": "node main.js"
}
}