Contact Details Merge & Deduplicate avatar
Contact Details Merge & Deduplicate

Pricing

Pay per usage

Go to Store
Contact Details Merge & Deduplicate

Contact Details Merge & Deduplicate

Developed by

Lukáš Křivka

Lukáš Křivka

Maintained by Community

Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.

0.0 (0)

Pricing

Pay per usage

7

Total users

94

Monthly users

4

Runs succeeded

>99%

Last modified

2 years ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
# The `|| true` after `npm list` keeps the build going even when the listing
# itself exits with an error; every other step in the chain must succeed.
# NOTE(review): the final `rm -r ~/.npm` presumably removes npm's cache
# directory so it does not end up in the image layer - confirm.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Run the image: start the Actor through the `start` script from package.json.
# `--silent` suppresses npm's own output around the script.
CMD npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor-19",
"title": "Scrape single page in JavaScript",
"description": "Scrape data from single page with provided URL.",
"version": "0.0",
"meta": {
"templateId": "js-start"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Scrape data from a web page",
"type": "object",
"schemaVersion": 1,
"properties": {
"datasetIds": {
"title": "Contact Details Dataset IDs",
"type": "array",
"description": "Provide one or more datasets from finished Contact Details Scraper runs",
"editor": "stringList"
},
"mergeToOneCell": {
"title": "Merge contacts of one type to a single column",
"type": "boolean",
"description": "By default, each contact will have a separate column. This option merges them to single column with a ';' as separator",
"default": false
}
},
"required": ["datasetIds"]
}

src/main.js

import { Actor } from "apify";

await Actor.init();

// `Actor.getInput()` can resolve to null when the run has no input; fall back to
// an empty object so the destructuring below does not throw an opaque TypeError.
const { datasetIds, mergeToOneCell = false } = (await Actor.getInput()) ?? {};

// Fail early with a readable message instead of handing an invalid input
// over to the dedup Actor.
if (!Array.isArray(datasetIds)) {
    throw new Error('Input field "datasetIds" must be an array of dataset IDs.');
}

// Copied most of the code from contact-info-bundle actor
/**
 * Collapses scraped items into one row per domain, each row holding the unique
 * contacts of every supported type.
 *
 * NOTE(review): this function is passed as input to another Actor (see the
 * `Actor.metamorph` call in this file), so it presumably has to stay
 * self-contained - keep constants and helpers inside the function body.
 *
 * @param {Array<object>} items - Items to merge; each is read for `domain`
 *   and for the array-valued contact fields listed in `CONTACT_FIELDS`.
 * @param {object} [options]
 * @param {object} [options.customInputData] - Extra input; only
 *   `mergeToOneCell` is read here.
 * @returns {Array<object>} One object per distinct domain with a `domain` key
 *   plus one key per contact field. Each contact field is an array of unique
 *   values, or a single '; '-joined string when `mergeToOneCell` is truthy.
 */
const transformFunction = (items, { customInputData = {} } = {}) => {
    const CONTACT_FIELDS = ['emails', 'phones', 'phonesUncertain', 'linkedIns', 'twitters', 'instagrams', 'facebooks',
        'youtubes', 'tiktoks', 'pinterests', 'discords'];

    const { mergeToOneCell } = customInputData;

    // Null-prototype dictionary: domains are arbitrary scraped strings, and on a
    // plain `{}` a key such as "constructor" or "__proto__" would resolve to an
    // inherited member, polluting built-ins and silently dropping that domain.
    const contactsPerDomain = Object.create(null);

    for (const item of items) {
        const { domain } = item;

        contactsPerDomain[domain] ??= {};
        const domainContacts = contactsPerDomain[domain];

        for (const contactField of CONTACT_FIELDS) {
            // Every field gets a Set even when the item has no such contacts,
            // so all output rows share the same columns.
            domainContacts[contactField] ??= new Set();
            for (const contact of item[contactField] || []) {
                domainContacts[contactField].add(contact);
            }
        }
    }

    // Now we iterate the domains and format the data
    const results = [];
    for (const [domain, contacts] of Object.entries(contactsPerDomain)) {
        const contactsFormatted = {};
        for (const [contactField, contactSet] of Object.entries(contacts)) {
            const uniqueContacts = [...contactSet];
            contactsFormatted[contactField] = mergeToOneCell
                ? uniqueContacts.join('; ')
                : uniqueContacts;
        }
        results.push({
            domain,
            ...contactsFormatted,
        });
    }
    return results;
};

// Hand the run over to the generic dedup Actor, passing the dataset IDs,
// the per-domain merge function and the option that function reads.
await Actor.metamorph(
    'lukaskrivka/dedup-datasets',
    {
        datasetIds,
        preDedupTransformFunction: transformFunction,
        customInputData: { mergeToOneCell },
    },
);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

package.json

{
"name": "js-scrape-single-page",
"version": "0.0.1",
"type": "module",
"description": "This is an example of an Apify actor.",
"engines": {
"node": ">=18.0.0"
},
"dependencies": {
"apify": "^3.0.0"
},
"scripts": {
"start": "node ./src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}