Contact Details Merge & Deduplicate
Pricing
Pay per usage
Go to Store
Contact Details Merge & Deduplicate
Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.
0.0 (0)
Pricing
Pay per usage
7
Total users
94
Monthly users
4
Runs succeeded
>99%
Last modified
2 years ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-19", "title": "Contact Details Merge & Deduplicate", "description": "Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.", "version": "0.0", "meta": { "templateId": "js-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Merge and deduplicate contact details", "type": "object", "schemaVersion": 1, "properties": { "datasetIds": { "title": "Contact Details Dataset IDs", "type": "array", "description": "Provide the IDs of one or more datasets from finished Contact Details Scraper runs", "editor": "stringList" }, "mergeToOneCell": { "title": "Merge contacts of one type to a single column", "type": "boolean", "description": "By default, each contact will have a separate column. This option merges them into a single column, using '; ' as the separator", "default": false } }, "required": ["datasetIds"]}
src/main.js
1import { Actor } from "apify";2
await Actor.init();

// Fall back to an empty object so that a run started without any input record
// (getInput() resolving to null) does not die on an opaque destructuring
// TypeError; `mergeToOneCell` still gets its documented default of `false`.
const { datasetIds, mergeToOneCell = false } = (await Actor.getInput()) ?? {};
// Copied most of the code from contact-info-bundle actor
/**
 * Collapses scraped contact items into one row per domain, de-duplicating the
 * values of every contact field.
 *
 * NOTE(review): this function is passed to another Actor as
 * `preDedupTransformFunction`, so it presumably travels as source text — keep
 * it fully self-contained (no references to names outside its own body).
 *
 * @param {object[]} items - Items to merge; each is read for `domain` and for
 *   the contact fields listed in `CONTACT_FIELDS` (missing fields are treated as empty).
 * @param {object} context
 * @param {object} context.customInputData
 * @param {boolean} [context.customInputData.mergeToOneCell] - When truthy, every
 *   contact field is emitted as a single '; '-joined string instead of an array.
 * @returns {object[]} One `{ domain, ...contactFields }` object per distinct
 *   domain, in order of first appearance.
 */
const transformFunction = (items, { customInputData }) => {
    const CONTACT_FIELDS = ['emails', 'phones', 'phonesUncertain', 'linkedIns', 'twitters', 'instagrams', 'facebooks',
        'youtubes', 'tiktoks', 'pinterests', 'discords'];

    const { mergeToOneCell } = customInputData;

    // Null-prototype dictionary: `domain` comes from scraped data, so a value such
    // as "constructor" or "__proto__" must become an ordinary key instead of
    // resolving to (and mutating) an inherited Object.prototype member.
    const contactsPerDomain = Object.create(null);

    for (const item of items) {
        const { domain } = item;
        const domainContacts = (contactsPerDomain[domain] ??= {});

        for (const contactField of CONTACT_FIELDS) {
            // A Set per field drops duplicates while keeping first-seen order.
            const contactSet = (domainContacts[contactField] ??= new Set());
            for (const contact of item[contactField] || []) {
                contactSet.add(contact);
            }
        }
    }

    // Now we iterate the domains and format the data
    const results = [];
    for (const [domain, contacts] of Object.entries(contactsPerDomain)) {
        const contactsFormatted = {};
        for (const [contactField, contactSet] of Object.entries(contacts)) {
            const contactList = [...contactSet];
            contactsFormatted[contactField] = mergeToOneCell ? contactList.join('; ') : contactList;
        }
        results.push({ domain, ...contactsFormatted });
    }
    return results;
};
// Hand the run over to the dedup Actor, passing the dataset IDs, the per-domain
// merge function and the flag that function reads from `customInputData`.
// Nothing is pushed to the dataset here: this script produces no items of its own.
await Actor.metamorph(
    'lukaskrivka/dedup-datasets',
    {
        datasetIds,
        preDedupTransformFunction: transformFunction,
        customInputData: { mergeToOneCell },
    },
);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
package.json
{ "name": "js-scrape-single-page", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.0.0" }, "scripts": { "start": "node ./src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}