Contact Details Merge & Deduplicate avatar
Contact Details Merge & Deduplicate
Try for free

No credit card required

View all Actors
Contact Details Merge & Deduplicate

Contact Details Merge & Deduplicate

lukaskrivka/contact-details-merge-deduplicate
Try for free

No credit card required

Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json first so that the
# dependency-install layer below can be served from the Docker layer
# cache whenever only source files change.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Keep the log quiet, then print the dependency tree
# and tool versions for debugging. `npm list` is allowed to fail so that a
# non-zero exit from it does not break the build. Finally remove the npm
# cache directory.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./


# Run the image. Exec (JSON array) form is used so the command is started
# directly rather than through `/bin/sh -c`, which would sit between the
# container runtime and the process and may not pass stop signals on.
CMD ["npm", "start", "--silent"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-19",
4    "title": "Contact Details Merge & Deduplicate",
5    "description": "Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "Contact Details Merge & Deduplicate",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "datasetIds": {
7            "title": "Contact Details Dataset IDs",
8            "type": "array",
9            "description": "Provide one or more datasets from finished Contact Details Scraper runs",
10            "editor": "stringList"
11        },
12        "mergeToOneCell": {
13            "title": "Merge contacts of one type to a single column",
14            "type": "boolean",
15            "description": "By default, each contact gets a separate column. This option merges all contacts of one type into a single column, separated by '; '.",
16            "default": false
17        }
18    },
19    "required": ["datasetIds"]
20}

src/main.js

1import { Actor } from "apify";
2
await Actor.init();

// Fall back to an empty object so that a missing input does not surface as an
// opaque "Cannot destructure property ... of null" TypeError.
// NOTE(review): per the Apify SDK docs getInput() resolves to null when no
// input record exists (e.g. a local run without INPUT.json) - confirm.
const { datasetIds, mergeToOneCell = false } = (await Actor.getInput()) ?? {};

// `datasetIds` is marked as required in .actor/input_schema.json; fail early
// with a readable message instead of forwarding an unusable value.
if (!Array.isArray(datasetIds)) {
    throw new Error('Input field "datasetIds" is required and must be an array of dataset IDs.');
}
5
// Copied most of the code from contact-info-bundle actor
/**
 * Collapses contact items into one row per domain with de-duplicated contacts.
 *
 * This function is passed as an input value to another Actor (see the
 * `Actor.metamorph` call in this file), so it presumably travels as source
 * text - TODO confirm. Keep it self-contained: do not reference imports or
 * variables from the surrounding module; everything it needs must be declared
 * inside the body or arrive through `customInputData`.
 *
 * @param {object[]} items - Items to merge. Each item is read for `domain`
 *   and for the array-valued fields listed in CONTACT_FIELDS; absent (falsy)
 *   fields are treated as empty.
 * @param {object} context
 * @param {object} context.customInputData - Must carry `mergeToOneCell`.
 * @returns {object[]} One object per distinct domain, in first-seen order:
 *   `{ domain, emails, phones, ... }`. Every contact field holds an array of
 *   unique values, or a single '; '-joined string when `mergeToOneCell` is truthy.
 */
const transformFunction = (items, { customInputData }) => {
    const CONTACT_FIELDS = ['emails', 'phones', 'phonesUncertain', 'linkedIns', 'twitters', 'instagrams', 'facebooks',
        'youtubes', 'tiktoks', 'pinterests', 'discords'];

    const { mergeToOneCell } = customInputData;

    // A Map is used instead of a plain object because the keys come from data:
    // on a plain object a domain such as 'constructor' or '__proto__' resolves
    // to an inherited member, which silently drops the row or writes the Sets
    // onto Object.prototype.
    const contactsPerDomain = new Map();

    for (const item of items) {
        // Stringify the key so grouping matches plain-object keying
        // (e.g. an item without a domain is grouped under 'undefined').
        const domain = String(item.domain);

        let contacts = contactsPerDomain.get(domain);
        if (!contacts) {
            // Pre-create one Set per field so every output row has all columns.
            contacts = new Map(CONTACT_FIELDS.map((contactField) => [contactField, new Set()]));
            contactsPerDomain.set(domain, contacts);
        }

        for (const contactField of CONTACT_FIELDS) {
            const contactSet = contacts.get(contactField);
            for (const contact of item[contactField] || []) {
                contactSet.add(contact);
            }
        }
    }

    // Now we iterate the domains and format the data
    const results = [];
    for (const [domain, contacts] of contactsPerDomain) {
        const row = { domain };
        for (const [contactField, contactSet] of contacts) {
            const values = [...contactSet];
            row[contactField] = mergeToOneCell ? values.join('; ') : values;
        }
        results.push(row);
    }
    return results;
};
44
// Hand the work over to the generic dataset-deduplication Actor, passing the
// dataset IDs, the per-domain merge function and the flag that function reads
// from `customInputData`.
await Actor.metamorph(
    'lukaskrivka/dedup-datasets',
    {
        datasetIds,
        preDedupTransformFunction: transformFunction,
        customInputData: { mergeToOneCell },
    },
);

// Nothing is pushed to the dataset from here: the template's leftover
// `Actor.pushData(headings)` referenced an undefined variable and would throw
// a ReferenceError whenever execution continues past the metamorph call.
// NOTE(review): execution presumably only continues when not running on the
// Apify platform - confirm against the SDK docs.

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.gitignore

1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json

package.json

1{
2    "name": "js-scrape-single-page",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.0.0"
11    },
12    "scripts": {
13        "start": "node ./src/main.js",
14        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
15    },
16    "author": "It's not you it's me",
17    "license": "ISC"
18}
Developer
Maintained by Community
Actor metrics
  • 25 monthly users
  • 4 stars
  • 95.6% runs succeeded
  • Created in Jul 2023
  • Modified about 1 year ago