Contact Details Merge & Deduplicate

Developed by Lukáš Křivka

Maintained by Community

Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.

Rating: 0.0 (0 reviews)
Pricing: Pay per usage
Monthly users: 7
Last modified: 2 years ago
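As described above, the Actor takes dataset IDs from finished Contact Details Scraper runs and outputs one merged, deduplicated row per domain. Below is a minimal sketch of starting it programmatically with the apify-client package; the Actor ID and dataset IDs are assumptions and should be replaced with your own values.

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// The Actor ID below is an assumption based on the store name; check the exact ID in the Apify Console.
const run = await client.actor('lukaskrivka/contact-details-merge-deduplicate').call({
    datasetIds: ['DATASET_ID_1', 'DATASET_ID_2'], // placeholder IDs of Contact Details Scraper datasets
    mergeToOneCell: false,
});

// The merged rows (one per domain) end up in the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);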

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, rebuilds will be fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-19",
    "title": "Scrape single page in JavaScript",
    "description": "Scrape data from single page with provided URL.",
    "version": "0.0",
    "meta": {
        "templateId": "js-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "datasetIds": {
            "title": "Contact Details Dataset IDs",
            "type": "array",
            "description": "Provide one or more dataset IDs from finished Contact Details Scraper runs.",
            "editor": "stringList"
        },
        "mergeToOneCell": {
            "title": "Merge contacts of one type to a single column",
            "type": "boolean",
            "description": "By default, each contact gets a separate column. This option merges them into a single column, separated by ';'.",
            "default": false
        }
    },
    "required": ["datasetIds"]
}
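For reference, an input object matching this schema might look like the following; the dataset IDs are placeholders.

{
    "datasetIds": ["DATASET_ID_1", "DATASET_ID_2"],
    "mergeToOneCell": true
}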

src/main.js

import { Actor } from "apify";

await Actor.init();
const { datasetIds, mergeToOneCell = false } = await Actor.getInput();

// Copied most of the code from the contact-info-bundle actor.
// This function is passed to the dedup-datasets actor and runs there,
// so it must be self-contained (it reads mergeToOneCell from customInputData).
const transformFunction = (items, { customInputData }) => {
    const CONTACT_FIELDS = ['emails', 'phones', 'phonesUncertain', 'linkedIns', 'twitters', 'instagrams', 'facebooks',
        'youtubes', 'tiktoks', 'pinterests', 'discords'];

    const contactsPerDomain = {};

    const { mergeToOneCell } = customInputData;

    // Collect contacts into a Set per domain and per contact field to deduplicate them.
    for (const item of items) {
        const { domain } = item;

        for (const contactField of CONTACT_FIELDS) {
            contactsPerDomain[domain] ??= {};
            contactsPerDomain[domain][contactField] ??= new Set();
            for (const contact of item[contactField] || []) {
                contactsPerDomain[domain][contactField].add(contact);
            }
        }
    }

    // Now we iterate the domains and format the data: one output row per domain.
    const results = [];
    for (const [domain, contacts] of Object.entries(contactsPerDomain)) {
        const contactsFormatted = {};
        for (const [contactField, contactSet] of Object.entries(contacts)) {
            contactsFormatted[contactField] = [...contactSet];
            if (mergeToOneCell) {
                contactsFormatted[contactField] = contactsFormatted[contactField].join('; ');
            }
        }
        results.push({
            domain,
            ...contactsFormatted,
        });
    }
    return results;
};

// Metamorph into the dedup-datasets actor, passing it the dataset IDs and
// the transform function to apply before deduplication.
await Actor.metamorph(
    'lukaskrivka/dedup-datasets',
    {
        datasetIds,
        preDedupTransformFunction: transformFunction,
        customInputData: { mergeToOneCell },
    },
);

// Metamorph replaces this Actor's process, so the line below is only reached if it fails.
// Gracefully exit the Actor process.
await Actor.exit();
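To make the transform's behavior concrete, here is a small hypothetical sketch (the sample contact values are invented) showing how duplicates collapse into a single row per domain:

// Hypothetical sample: two Contact Details Scraper items for the same domain.
const sampleItems = [
    { domain: 'example.com', emails: ['info@example.com'], phones: ['+1 555 0100'] },
    { domain: 'example.com', emails: ['info@example.com', 'sales@example.com'] },
];

// With { mergeToOneCell: false } the duplicate email is dropped and arrays are kept:
//   [{ domain: 'example.com', emails: ['info@example.com', 'sales@example.com'], phones: ['+1 555 0100'], ... }]
// With { mergeToOneCell: true } each field is joined into a single '; '-separated string:
//   [{ domain: 'example.com', emails: 'info@example.com; sales@example.com', phones: '+1 555 0100', ... }]
// (contact fields not present in the items come out as empty arrays or empty strings)
console.log(transformFunction(sampleItems, { customInputData: { mergeToOneCell: true } }));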

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

package.json

{
    "name": "js-scrape-single-page",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.0.0"
    },
    "scripts": {
        "start": "node ./src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

Pricing

Pricing model: Pay per usage

This Actor is paid per platform usage. The Actor itself is free to use; you only pay for the Apify platform resources it consumes.