Duplications Checker avatar

Duplications Checker

Try for free

No credit card required

Go to Store
Duplications Checker

Duplications Checker

lukaskrivka/duplications-checker
Try for free

No credit card required

Check your dataset for duplications. Accept only the highest quality data!

Developer
Maintained by Community

Actor Metrics

  • 9 monthly users

  • No reviews yet

  • 12 bookmarks

  • >99% runs succeeded

  • Created in Aug 2019

  • Modified 4 years ago

You can access the Duplications Checker programmatically from your own applications by using the Apify API. You can also choose your preferred programming language below. To use the Apify API, you’ll need an Apify account and your API token, which you can find under Integrations settings in Apify Console.

1{
2  "openapi": "3.0.1",
3  "info": {
4    "version": "0.1",
5    "x-build-id": "DvYXmY5xAeWGd52zH"
6  },
7  "servers": [
8    {
9      "url": "https://api.apify.com/v2"
10    }
11  ],
12  "paths": {
13    "/acts/lukaskrivka~duplications-checker/run-sync-get-dataset-items": {
14      "post": {
15        "operationId": "run-sync-get-dataset-items-lukaskrivka-duplications-checker",
16        "x-openai-isConsequential": false,
17        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
18        "tags": [
19          "Run Actor"
20        ],
21        "requestBody": {
22          "required": true,
23          "content": {
24            "application/json": {
25              "schema": {
26                "$ref": "#/components/schemas/inputSchema"
27              }
28            }
29          }
30        },
31        "parameters": [
32          {
33            "name": "token",
34            "in": "query",
35            "required": true,
36            "schema": {
37              "type": "string"
38            },
39            "description": "Enter your Apify token here"
40          }
41        ],
42        "responses": {
43          "200": {
44            "description": "OK"
45          }
46        }
47      }
48    },
49    "/acts/lukaskrivka~duplications-checker/runs": {
50      "post": {
51        "operationId": "runs-sync-lukaskrivka-duplications-checker",
52        "x-openai-isConsequential": false,
53        "summary": "Executes an Actor and returns information about the initiated run in response.",
54        "tags": [
55          "Run Actor"
56        ],
57        "requestBody": {
58          "required": true,
59          "content": {
60            "application/json": {
61              "schema": {
62                "$ref": "#/components/schemas/inputSchema"
63              }
64            }
65          }
66        },
67        "parameters": [
68          {
69            "name": "token",
70            "in": "query",
71            "required": true,
72            "schema": {
73              "type": "string"
74            },
75            "description": "Enter your Apify token here"
76          }
77        ],
78        "responses": {
79          "200": {
80            "description": "OK",
81            "content": {
82              "application/json": {
83                "schema": {
84                  "$ref": "#/components/schemas/runsResponseSchema"
85                }
86              }
87            }
88          }
89        }
90      }
91    },
92    "/acts/lukaskrivka~duplications-checker/run-sync": {
93      "post": {
94        "operationId": "run-sync-lukaskrivka-duplications-checker",
95        "x-openai-isConsequential": false,
96        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
97        "tags": [
98          "Run Actor"
99        ],
100        "requestBody": {
101          "required": true,
102          "content": {
103            "application/json": {
104              "schema": {
105                "$ref": "#/components/schemas/inputSchema"
106              }
107            }
108          }
109        },
110        "parameters": [
111          {
112            "name": "token",
113            "in": "query",
114            "required": true,
115            "schema": {
116              "type": "string"
117            },
118            "description": "Enter your Apify token here"
119          }
120        ],
121        "responses": {
122          "200": {
123            "description": "OK"
124          }
125        }
126      }
127    }
128  },
129  "components": {
130    "schemas": {
131      "inputSchema": {
132        "type": "object",
133        "required": [
134          "fields"
135        ],
136        "properties": {
137          "datasetId": {
138            "title": "Dataset ID",
139            "type": "string",
140            "description": "ID of the dataset where the data is located. If you need to use other input types, such as a key-value store or raw JSON, see `Other data sources`"
141          },
142          "checkOnlyCleanItems": {
143            "title": "Check only clean dataset items",
144            "type": "boolean",
145            "description": "Only clean dataset items will be loaded and used for duplication checking if the `datasetId` option is provided.",
146            "default": false
147          },
148          "fields": {
149            "title": "Fields",
150            "type": "array",
151            "description": "List of fields in each item that will be checked for duplicates. Each given field must not be nested and it should contain only simple value (string or number). You can prepare your data with preCheckFunction.",
152            "default": [],
153            "items": {
154              "type": "string"
155            }
156          },
157          "preCheckFunction": {
158            "title": "Pre-check function",
159            "type": "string",
160            "description": "You can specify which fields should display in the debug OUTPUT to identify bad items. By default it shows all fields which may make it unnecessary big."
161          },
162          "minDuplications": {
163            "title": "Minimum duplications",
164            "minimum": 2,
165            "type": "integer",
166            "description": "Minimum occurrences to be included in the report. Defaults to 2",
167            "default": 2
168          },
169          "showIndexes": {
170            "title": "Show indexes",
171            "type": "boolean",
172            "description": "Indexes of the duplicate items will be shown in the OUTPUT report. Set to false if you don't need them.",
173            "default": true
174          },
175          "showItems": {
176            "title": "Show items",
177            "type": "boolean",
178            "description": "Duplicate items will be pushed to a dataset. Set to false if you don't need them.",
179            "default": true
180          },
181          "showMissing": {
182            "title": "Show missing fields",
183            "type": "boolean",
184            "description": "Items where the value for the `field` is missing, `null`, or `''` will be included in the report.",
185            "default": true
186          },
187          "limit": {
188            "title": "Limit",
189            "minimum": 1,
190            "type": "integer",
191            "description": "How many items will be checked. Default is all"
192          },
193          "offset": {
194            "title": "Offset",
195            "minimum": 1,
196            "type": "integer",
197            "description": "From which item the checking will start. Use with limit to check specific items."
198          },
199          "batchSize": {
200            "title": "Batch Size",
201            "minimum": 1,
202            "type": "integer",
203            "description": "You can change the number of items loaded and processed in each batch. This is only needed if your items are very large.",
204            "default": 1000
205          },
206          "keyValueStoreRecord": {
207            "title": "Key value store Record",
208            "type": "string",
209            "description": "ID and record key if you want to load from KV store. Format is `{keyValueStoreId}+{recordKey}`, e.g. `s5NJ77qFv8b4osiGR+MY-KEY`"
210          },
211          "rawData": {
212            "title": "Raw Data",
213            "type": "array",
214            "description": "Raw JSON array you want to check."
215          }
216        }
217      },
218      "runsResponseSchema": {
219        "type": "object",
220        "properties": {
221          "data": {
222            "type": "object",
223            "properties": {
224              "id": {
225                "type": "string"
226              },
227              "actId": {
228                "type": "string"
229              },
230              "userId": {
231                "type": "string"
232              },
233              "startedAt": {
234                "type": "string",
235                "format": "date-time",
236                "example": "2025-01-08T00:00:00.000Z"
237              },
238              "finishedAt": {
239                "type": "string",
240                "format": "date-time",
241                "example": "2025-01-08T00:00:00.000Z"
242              },
243              "status": {
244                "type": "string",
245                "example": "READY"
246              },
247              "meta": {
248                "type": "object",
249                "properties": {
250                  "origin": {
251                    "type": "string",
252                    "example": "API"
253                  },
254                  "userAgent": {
255                    "type": "string"
256                  }
257                }
258              },
259              "stats": {
260                "type": "object",
261                "properties": {
262                  "inputBodyLen": {
263                    "type": "integer",
264                    "example": 2000
265                  },
266                  "rebootCount": {
267                    "type": "integer",
268                    "example": 0
269                  },
270                  "restartCount": {
271                    "type": "integer",
272                    "example": 0
273                  },
274                  "resurrectCount": {
275                    "type": "integer",
276                    "example": 0
277                  },
278                  "computeUnits": {
279                    "type": "integer",
280                    "example": 0
281                  }
282                }
283              },
284              "options": {
285                "type": "object",
286                "properties": {
287                  "build": {
288                    "type": "string",
289                    "example": "latest"
290                  },
291                  "timeoutSecs": {
292                    "type": "integer",
293                    "example": 300
294                  },
295                  "memoryMbytes": {
296                    "type": "integer",
297                    "example": 1024
298                  },
299                  "diskMbytes": {
300                    "type": "integer",
301                    "example": 2048
302                  }
303                }
304              },
305              "buildId": {
306                "type": "string"
307              },
308              "defaultKeyValueStoreId": {
309                "type": "string"
310              },
311              "defaultDatasetId": {
312                "type": "string"
313              },
314              "defaultRequestQueueId": {
315                "type": "string"
316              },
317              "buildNumber": {
318                "type": "string",
319                "example": "1.0.0"
320              },
321              "containerUrl": {
322                "type": "string"
323              },
324              "usage": {
325                "type": "object",
326                "properties": {
327                  "ACTOR_COMPUTE_UNITS": {
328                    "type": "integer",
329                    "example": 0
330                  },
331                  "DATASET_READS": {
332                    "type": "integer",
333                    "example": 0
334                  },
335                  "DATASET_WRITES": {
336                    "type": "integer",
337                    "example": 0
338                  },
339                  "KEY_VALUE_STORE_READS": {
340                    "type": "integer",
341                    "example": 0
342                  },
343                  "KEY_VALUE_STORE_WRITES": {
344                    "type": "integer",
345                    "example": 1
346                  },
347                  "KEY_VALUE_STORE_LISTS": {
348                    "type": "integer",
349                    "example": 0
350                  },
351                  "REQUEST_QUEUE_READS": {
352                    "type": "integer",
353                    "example": 0
354                  },
355                  "REQUEST_QUEUE_WRITES": {
356                    "type": "integer",
357                    "example": 0
358                  },
359                  "DATA_TRANSFER_INTERNAL_GBYTES": {
360                    "type": "integer",
361                    "example": 0
362                  },
363                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
364                    "type": "integer",
365                    "example": 0
366                  },
367                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
368                    "type": "integer",
369                    "example": 0
370                  },
371                  "PROXY_SERPS": {
372                    "type": "integer",
373                    "example": 0
374                  }
375                }
376              },
377              "usageTotalUsd": {
378                "type": "number",
379                "example": 0.00005
380              },
381              "usageUsd": {
382                "type": "object",
383                "properties": {
384                  "ACTOR_COMPUTE_UNITS": {
385                    "type": "integer",
386                    "example": 0
387                  },
388                  "DATASET_READS": {
389                    "type": "integer",
390                    "example": 0
391                  },
392                  "DATASET_WRITES": {
393                    "type": "integer",
394                    "example": 0
395                  },
396                  "KEY_VALUE_STORE_READS": {
397                    "type": "integer",
398                    "example": 0
399                  },
400                  "KEY_VALUE_STORE_WRITES": {
401                    "type": "number",
402                    "example": 0.00005
403                  },
404                  "KEY_VALUE_STORE_LISTS": {
405                    "type": "integer",
406                    "example": 0
407                  },
408                  "REQUEST_QUEUE_READS": {
409                    "type": "integer",
410                    "example": 0
411                  },
412                  "REQUEST_QUEUE_WRITES": {
413                    "type": "integer",
414                    "example": 0
415                  },
416                  "DATA_TRANSFER_INTERNAL_GBYTES": {
417                    "type": "integer",
418                    "example": 0
419                  },
420                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
421                    "type": "integer",
422                    "example": 0
423                  },
424                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
425                    "type": "integer",
426                    "example": 0
427                  },
428                  "PROXY_SERPS": {
429                    "type": "integer",
430                    "example": 0
431                  }
432                }
433              }
434            }
435          }
436        }
437      }
438    }
439  }
440}

Duplications Checker OpenAPI definition

OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.

OpenAPI is effective when used with AI agents and GPTs by standardizing how these systems interact with various APIs, for reliable integrations and efficient communication.

By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.

You can download the OpenAPI definitions for Duplications Checker from the options below:

If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.

You can also check out our other API clients: