Contact Details Merge & Deduplicate
Try for free
No credit card required
View all Actors
Contact Details Merge & Deduplicate
lukaskrivka/contact-details-merge-deduplicate
Try for free
No credit card required
Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor-19",
4 "title": "Contact Details Merge & Deduplicate",
5 "description": "Merge and deduplicate all contacts extracted by Contact Details Scraper. Works with multiple datasets. One row per domain.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-start"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "Contact Details Merge & Deduplicate input",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "datasetIds": {
7 "title": "Contact Details Dataset IDs",
8 "type": "array",
9 "description": "Provide one or more datasets from finished Contact Details Scraper runs",
10 "editor": "stringList"
11 },
12 "mergeToOneCell": {
13 "title": "Merge contacts of one type to a single column",
14 "type": "boolean",
15 "description": "By default, each contact will have a separate column. This option merges all contacts of one type into a single column, using '; ' as the separator.",
16 "default": false
17 }
18 },
19 "required": ["datasetIds"]
20}
src/main.js
1import { Actor } from "apify";
2
// Initialize the Actor SDK before any other SDK call.
await Actor.init();

// Read the Actor input. `getInput()` may resolve to null when no input was provided,
// in which case destructuring it directly would fail with an opaque TypeError.
const input = await Actor.getInput();
if (input == null) {
    throw new Error('Actor input is missing. Provide an input object with "datasetIds".');
}

// `datasetIds`: dataset IDs to merge; `mergeToOneCell`: join each contact type into one string (off by default).
const { datasetIds, mergeToOneCell = false } = input;
5
// Copied most of the code from contact-info-bundle actor
/**
 * Groups scraped items by `domain` and merges their contacts so that the
 * result contains exactly one row per domain with duplicate contacts removed.
 *
 * NOTE(review): this function is passed as `preDedupTransformFunction` in the
 * metamorph input, so it is presumably serialized and executed by the target
 * Actor. Keep it self-contained (no references to outer scope) - confirm.
 *
 * @param {Array<object>} items Scraped items; each may carry a `domain` and
 *   any of the contact fields as arrays.
 * @param {{ customInputData: { mergeToOneCell?: boolean } }} context
 *   `mergeToOneCell` switches the output from arrays to '; '-joined strings.
 * @returns {Array<object>} One object per domain: `{ domain, ...contactFields }`.
 */
const transformFunction = (items, { customInputData }) => {
    const CONTACT_FIELDS = ['emails', 'phones', 'phonesUncertain', 'linkedIns', 'twitters', 'instagrams', 'facebooks',
        'youtubes', 'tiktoks', 'pinterests', 'discords'];

    const { mergeToOneCell } = customInputData;

    // Prototype-less dictionary: domains come from scraped data, and on a plain
    // `{}` a key such as "__proto__" or "constructor" would resolve to an
    // inherited object, polluting it and silently dropping that domain's row.
    const contactsPerDomain = Object.create(null);

    for (const item of items) {
        const { domain } = item;

        contactsPerDomain[domain] ??= {};
        const domainContacts = contactsPerDomain[domain];

        for (const contactField of CONTACT_FIELDS) {
            // Every field gets a Set, even when empty, so all rows share the same columns.
            domainContacts[contactField] ??= new Set();
            for (const contact of item[contactField] || []) {
                domainContacts[contactField].add(contact);
            }
        }
    }

    // Now we iterate the domains and format the data
    const results = [];
    for (const [domain, contacts] of Object.entries(contactsPerDomain)) {
        const contactsFormatted = {};
        for (const [contactField, contactSet] of Object.entries(contacts)) {
            const uniqueContacts = [...contactSet];
            contactsFormatted[contactField] = mergeToOneCell
                ? uniqueContacts.join('; ')
                : uniqueContacts;
        }
        results.push({
            domain,
            ...contactsFormatted,
        });
    }
    return results;
};
44
// Hand this run over to the generic dedup Actor, passing the dataset IDs, the
// per-domain merge function and the option it reads from `customInputData`.
await Actor.metamorph(
    'lukaskrivka/dedup-datasets',
    {
        datasetIds,
        preDedupTransformFunction: transformFunction,
        customInputData: { mergeToOneCell },
    },
);

// NOTE(review): presumably reached only when metamorph returns without replacing
// this run (e.g. outside the Apify platform) - confirm against the SDK docs.
// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.gitignore
1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json
package.json
1{
2 "name": "js-scrape-single-page",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "Merges and deduplicates contacts extracted by Contact Details Scraper, producing one row per domain.",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.0.0"
11 },
12 "scripts": {
13 "start": "node ./src/main.js",
14 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
15 },
16 "author": "It's not you it's me",
17 "license": "ISC"
18}
Developer
Maintained by Community
Actor metrics
- 25 monthly users
- 4 stars
- 95.6% runs succeeded
- Created in Jul 2023
- Modified about 1 year ago
Categories