Actor picture

Datasets Compare

petr_cermak/datasets-compare

Act for comparing crawler execution results. By default the final result set will contain only new and updated records.

No credit card required

Author: Petr Cermak
  • Modified
  • Users: 12
  • Runs: 2,338

Dockerfile

# Dockerfile template used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-basic:v0.21.10

# Copy only package.json and package-lock.json first: they should be the
# only files that affect "npm install" in the next step, so copying them
# separately from the source code speeds up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree, the Node.js version and the NPM version for debugging.
# "npm list" is wrapped in "|| true" so that a non-zero exit status from
# it does not abort the rest of the command chain.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the source code into the container.
# Do this in the last step, to have a fast build if only the source code changed
COPY  . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

INPUT_SCHEMA.json

{
  "title": "Datasets Compare",
  "description": "Compares two datasets and reports new and updated records",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "idAttr": {
      "title": "ID attribute",
      "type": "string",
      "description": "ID attribute name",
      "editor": "textfield"
    },
    "oldDataset": {
      "title": "Old dataset ID",
      "type": "string",
      "description": "Old dataset ID",
      "editor": "textfield"
    },
    "newDataset": {
      "title": "New dataset ID",
      "type": "string",
      "description": "New dataset ID",
      "editor": "textfield"
    }
  }
}

main.js

This file is 180 lines long. Only the first 50 are shown. Show all

const Apify = require('apify');
const _ = require('lodash');

/**
 * Derives the comparison key of a record.
 *
 * @param {Object} result - Record to build the key for; a falsy value yields `null`.
 * @param {string|string[]} idAttr - Name of the ID attribute, or a list of
 *   attribute names whose values are joined with `_` into a composite key.
 * @returns {*} `null` for a falsy record, the joined string for a list of
 *   attributes, otherwise the raw value of `result[idAttr]`.
 */
function createKey(result, idAttr){
    if(!result){
        return null;
    }
    if(!Array.isArray(idAttr)){
        return result[idAttr];
    }
    // Composite key: collect each attribute value in order, then join.
    const parts = [];
    for(const attrName of idAttr){
        parts.push(result[attrName]);
    }
    return parts.join('_');
}

/**
 * Reads a dataset page by page and hands every non-empty page to a callback.
 *
 * Pages of 10000 items are requested through `Apify.client.datasets.getItems`,
 * starting at `offset` and advancing by the page size after each callback.
 * Reading stops at the first page that is falsy or has no items.
 *
 * @param {string} datasetId - ID of the dataset to read.
 * @param {Function} process - Async callback invoked with the raw response of
 *   each non-empty page (either an array-like with `length`, or an object
 *   with an `items` array).
 * @param {number} [offset] - Starting offset; a falsy value means 0.
 * @returns {Promise<void>}
 */
async function loadResults(datasetId, process, offset){
    const pageSize = 10000;
    let cursor = offset ? offset : 0;
    for(;;){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset: cursor,
            limit: pageSize
        });
        // A page counts as non-empty when it has a truthy length itself or a non-empty `items` list.
        const itemCount = page ? (page.length || (page.items && page.items.length)) : 0;
        if(!itemCount){
            return;
        }
        await process(page);
        cursor += pageSize;
    }
}

/**
 * Builds a lookup object of the records of a dataset, keyed by record ID.
 *
 * Every page delivered by `loadResults` is flattened one level and each
 * record is stored under the key produced by `createKey`. Records whose key
 * is falsy are not stored; when several records share a key, the last one wins.
 *
 * @param {string} oldExecId - ID of the dataset to index (passed to `loadResults`).
 * @param {string|string[]} idAttr - ID attribute name(s), passed to `createKey`.
 * @returns {Promise<Object>} Map of key -> record.
 */
async function createCompareMap(oldExecId, idAttr){
    const data = {};
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        // loadResults forwards the raw page, which may be a plain array or an
        // object carrying the records in `items`; accept both shapes so that
        // array pages are not silently treated as empty.
        const page = Array.isArray(fullResults) ? fullResults : (fullResults && fullResults.items);
        const results = _.flatten(page || []);
        for(const result of results){
            const key = createKey(result, idAttr);
            if(key){data[key] = result;}
        }
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}

async function compareResults(newExecId, compareMap, idAttr, settings){
    let data = [];
    let processed = 0, pushData = null;
    let newCount = 0, updCount = 0, delCount = 0, uncCount = 0, index = 0;
    
    if(settings.useDataset){
        pushData = async (value, flush) => {

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "Act for comparing crawler execution results.",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10",
        "lodash": "latest",
        "bluebird": "latest"
    },
    "scripts": {
        "start": "node main.js"
    }
}