Act for comparing crawler execution results. By default the final result set will contain only new and updated records.
- Modified
- Users: 12
- Runs: 2,338
Dockerfile
# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-basic:v0.21.10
# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, but print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy the rest of the source code into the container.
# Done as the last step, so rebuilds are fast when only the source code changed.
COPY . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
INPUT_SCHEMA.json
{
"title": "Fructidor.fr scraper",
"description": "Fructidor.fr scraper",
"type": "object",
"schemaVersion": 1,
"properties": {
"idAttr": {
"title": "ID attribute",
"type": "string",
"description": "ID attribute name",
"editor": "textfield"
},
"oldDataset": {
"title": "Old dataset ID",
"type": "string",
"description": "Old dataset ID",
"editor": "textfield"
},
"newDataset": {
"title": "New dataset ID",
"type": "string",
"description": "New dataset ID",
"editor": "textfield"
}
}
}
main.js
main.js is 180 lines long; only the first 50 lines are shown below.
const Apify = require('apify');
const _ = require('lodash');
// Builds the comparison key for a result record.
// idAttr is either a single attribute name or an array of names; for an
// array, the attribute values are joined with '_'. Returns null when the
// record itself is missing (null/undefined).
function createKey(result, idAttr){
    if(!result){
        return null;
    }
    if(Array.isArray(idAttr)){
        const parts = [];
        for(const attr of idAttr){
            parts.push(result[attr]);
        }
        return parts.join('_');
    }
    return result[idAttr];
}
/**
 * Pages through a dataset and feeds each non-empty page to the `process`
 * callback. Pages are fetched in chunks of 10000 items; iteration stops at
 * the first empty page.
 *
 * Fix: the original had two branches that both called `process(newItems)`
 * with identical arguments — collapsed into a single call. The unbounded
 * async recursion was also replaced with a loop.
 *
 * @param {String} datasetId - ID of the dataset to read.
 * @param {Function} process - Async callback invoked with each page
 *        (either a raw array of items or an object with an `items` array,
 *        depending on the client version).
 * @param {Number} [offset=0] - Starting offset into the dataset.
 */
async function loadResults(datasetId, process, offset){
    const limit = 10000;
    if(!offset){offset = 0;}
    for(;;){
        const page = await Apify.client.datasets.getItems({
            datasetId,
            offset,
            limit
        });
        // A page is non-empty if it is a raw array with items, or an
        // object whose `items` array has items.
        const hasItems = Boolean(page && (page.length || (page.items && page.items.length)));
        if(!hasItems){break;}
        await process(page);
        offset += limit;
    }
}
/**
 * Loads every record from the old dataset and indexes it by its comparison
 * key (see createKey), so new results can be matched against old ones.
 *
 * Fix: loadResults may deliver a page either as a raw array or as an
 * object with an `items` array; the original only read `.items` and would
 * silently produce an empty map for raw-array pages. The unused `index`
 * callback parameter was also dropped.
 *
 * @param {String} oldExecId - ID of the dataset with the previous run's results.
 * @param {String|Array} idAttr - Attribute name(s) that form the record key.
 * @returns {Object} Map from record key to the old result record.
 */
async function createCompareMap(oldExecId, idAttr){
    const data = {};
    let processed = 0;
    console.log('creating comparing map');
    await loadResults(oldExecId, async (fullResults) => {
        // Accept both page shapes; _.flatten tolerates undefined and
        // returns an empty array in that case.
        const page = Array.isArray(fullResults) ? fullResults : fullResults.items;
        const results = _.flatten(page);
        _.each(results, (result) => {
            const key = createKey(result, idAttr);
            // Records without a usable key cannot be compared — skip them.
            if(key){data[key] = result;}
        });
        processed += results.length;
        console.log('processed old results: ' + processed);
    });
    console.log('comparing map created');
    return data;
}
async function compareResults(newExecId, compareMap, idAttr, settings){
let data = [];
let processed = 0, pushData = null;
let newCount = 0, updCount = 0, delCount = 0, uncCount = 0, index = 0;
if(settings.useDataset){
pushData = async (value, flush) => {
package.json
{
"name": "apify-project",
"version": "0.0.1",
"description": "",
"author": "It's not you it's me",
"license": "ISC",
"dependencies": {
"apify": "0.21.10",
"lodash": "latest",
"bluebird": "latest"
},
"scripts": {
"start": "node main.js"
}
}