Dataset Validity Checker avatar

Dataset Validity Checker

Try for free

No credit card required

Go to Store
Dataset Validity Checker

Dataset Validity Checker

equidem/dataset-validity-checker
Try for free

No credit card required

Automatically checks whether the default datasets created by runs of an actor differ too much from previously encountered ones, allowing it to warn you about web scraping problems caused by, e.g., a change in the website layout or other significant changes in the resulting data.

You can access the Dataset Validity Checker programmatically from your own applications by using the Apify API. You can choose your preferred language below. To use the Apify API, you’ll need an Apify account and your API token, found in Integrations settings in Apify Console.

1{
2  "openapi": "3.0.1",
3  "info": {
4    "version": "0.0",
5    "x-build-id": "buXRAFS7bifngRFaC"
6  },
7  "servers": [
8    {
9      "url": "https://api.apify.com/v2"
10    }
11  ],
12  "paths": {
13    "/acts/equidem~dataset-validity-checker/run-sync-get-dataset-items": {
14      "post": {
15        "operationId": "run-sync-get-dataset-items-equidem-dataset-validity-checker",
16        "x-openai-isConsequential": false,
17        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
18        "tags": [
19          "Run Actor"
20        ],
21        "requestBody": {
22          "required": true,
23          "content": {
24            "application/json": {
25              "schema": {
26                "$ref": "#/components/schemas/inputSchema"
27              }
28            }
29          }
30        },
31        "parameters": [
32          {
33            "name": "token",
34            "in": "query",
35            "required": true,
36            "schema": {
37              "type": "string"
38            },
39            "description": "Enter your Apify token here"
40          }
41        ],
42        "responses": {
43          "200": {
44            "description": "OK"
45          }
46        }
47      }
48    },
49    "/acts/equidem~dataset-validity-checker/runs": {
50      "post": {
51        "operationId": "runs-sync-equidem-dataset-validity-checker",
52        "x-openai-isConsequential": false,
53        "summary": "Executes an Actor and returns information about the initiated run in response.",
54        "tags": [
55          "Run Actor"
56        ],
57        "requestBody": {
58          "required": true,
59          "content": {
60            "application/json": {
61              "schema": {
62                "$ref": "#/components/schemas/inputSchema"
63              }
64            }
65          }
66        },
67        "parameters": [
68          {
69            "name": "token",
70            "in": "query",
71            "required": true,
72            "schema": {
73              "type": "string"
74            },
75            "description": "Enter your Apify token here"
76          }
77        ],
78        "responses": {
79          "200": {
80            "description": "OK",
81            "content": {
82              "application/json": {
83                "schema": {
84                  "$ref": "#/components/schemas/runsResponseSchema"
85                }
86              }
87            }
88          }
89        }
90      }
91    },
92    "/acts/equidem~dataset-validity-checker/run-sync": {
93      "post": {
94        "operationId": "run-sync-equidem-dataset-validity-checker",
95        "x-openai-isConsequential": false,
96        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
97        "tags": [
98          "Run Actor"
99        ],
100        "requestBody": {
101          "required": true,
102          "content": {
103            "application/json": {
104              "schema": {
105                "$ref": "#/components/schemas/inputSchema"
106              }
107            }
108          }
109        },
110        "parameters": [
111          {
112            "name": "token",
113            "in": "query",
114            "required": true,
115            "schema": {
116              "type": "string"
117            },
118            "description": "Enter your Apify token here"
119          }
120        ],
121        "responses": {
122          "200": {
123            "description": "OK"
124          }
125        }
126      }
127    }
128  },
129  "components": {
130    "schemas": {
131      "inputSchema": {
132        "type": "object",
133        "properties": {
134          "actId": {
135            "title": "Actor Id",
136            "type": "string",
137            "description": "Id of the actor whose datasets the validity checker is supposed to process."
138          },
139          "taskId": {
140            "title": "Task Id",
141            "type": "string",
142            "description": "Id of the task whose datasets the validity checker is supposed to process. Supersedes the actId."
143          },
144          "token": {
145            "title": "User Token",
146            "type": "string",
147            "description": "Token of the user owning the examined actor/task. If not filled, token of the user starting the Dataset Validity Checker is used."
148          },
149          "warningEmail": {
150            "title": "Warning Email",
151            "type": "string",
152            "description": "An email address to which warnings about invalid datasets should be sent."
153          },
154          "clearHistory": {
155            "title": "Clear History",
156            "type": "boolean",
157            "description": "Set to true if you want the validity checker to discard all previously gathered information about datasets and start anew. You should use this option if you change the actor in a way that significantly changes its results, or if the website changes significantly in a way that doesn't actually break your actor (e.g. the number of different items available for purchase at an e-shop changes drastically).",
158            "default": false
159          },
160          "previousDatasetsTakenIntoAccount": {
161            "title": "Previous Datasets Considered",
162            "minimum": 3,
163            "maximum": 100000,
164            "type": "integer",
165            "description": "A number of previous datasets that will be considered when determining whether the dataset is valid. If not filled, the value will be 100."
166          },
167          "minimalDatasetCount": {
168            "title": "Minimal Datasets",
169            "minimum": 3,
170            "maximum": 100000,
171            "type": "integer",
172            "description": "Minimum number of processed datasets required before further datasets can be validated. Must not exceed the value of 'Previous Datasets Considered'. If not filled, the value will be 10."
173          },
174          "numberHandlingPolicy": {
175            "title": "Number Handling Policy",
176            "enum": [
177              "loose",
178              "strict"
179            ],
180            "type": "string",
181            "description": "Governs what attributes the Dataset Validity Checker considers to be numbers. If it is 'strict', only values saved as number type will be considered as such. If 'loose', strings that are numbers in a non-scientific notation are also handled like numbers. The 'strict' policy is generally better, but if you don't convert numbers to the proper type, using 'loose' should give you better results.",
182            "default": "loose"
183          },
184          "startingAt": {
185            "title": "Starting At",
186            "type": "string",
187            "description": "Allows you to control which run will be the earliest one whose dataset is processed by this run of Dataset Validity Checker. Will be superseded if runs from a later time have already been processed. Has to be an ISO 8601 compliant date/time in UTC."
188          },
189          "until": {
190            "title": "Until",
191            "type": "string",
192            "description": "Allows you to control which run will be the latest one whose dataset is processed by this run of Dataset Validity Checker. Has to be an ISO 8601 compliant date/time in UTC."
193          },
194          "averageMultiplyingCoefficient": {
195            "title": "Average Multiplying Coefficient",
196            "type": "string",
197            "description": "Controls how different the dataset can be compared to the previously seen datasets to still be considered valid in terms of multiples of average difference. Default value is 5."
198          },
199          "maximalMultiplyingCoefficient": {
200            "title": "Maximal Multiplying Coefficient",
201            "type": "string",
202            "description": "Controls how different the dataset can be compared to the previously seen datasets to still be considered valid in terms of multiples of maximal difference. Default value is 2."
203          },
204          "leniencyCoefficient": {
205            "title": "Leniency Coefficient",
206            "type": "string",
207            "description": "Allows you to control both 'Maximal Multiplying Coefficient' and 'Average Multiplying Coefficient' at the same time. Is multiplicative, so a value of 2 increases both of them by a factor of 2. Default value is 1."
208          }
209        }
210      },
211      "runsResponseSchema": {
212        "type": "object",
213        "properties": {
214          "data": {
215            "type": "object",
216            "properties": {
217              "id": {
218                "type": "string"
219              },
220              "actId": {
221                "type": "string"
222              },
223              "userId": {
224                "type": "string"
225              },
226              "startedAt": {
227                "type": "string",
228                "format": "date-time",
229                "example": "2025-01-08T00:00:00.000Z"
230              },
231              "finishedAt": {
232                "type": "string",
233                "format": "date-time",
234                "example": "2025-01-08T00:00:00.000Z"
235              },
236              "status": {
237                "type": "string",
238                "example": "READY"
239              },
240              "meta": {
241                "type": "object",
242                "properties": {
243                  "origin": {
244                    "type": "string",
245                    "example": "API"
246                  },
247                  "userAgent": {
248                    "type": "string"
249                  }
250                }
251              },
252              "stats": {
253                "type": "object",
254                "properties": {
255                  "inputBodyLen": {
256                    "type": "integer",
257                    "example": 2000
258                  },
259                  "rebootCount": {
260                    "type": "integer",
261                    "example": 0
262                  },
263                  "restartCount": {
264                    "type": "integer",
265                    "example": 0
266                  },
267                  "resurrectCount": {
268                    "type": "integer",
269                    "example": 0
270                  },
271                  "computeUnits": {
272                    "type": "integer",
273                    "example": 0
274                  }
275                }
276              },
277              "options": {
278                "type": "object",
279                "properties": {
280                  "build": {
281                    "type": "string",
282                    "example": "latest"
283                  },
284                  "timeoutSecs": {
285                    "type": "integer",
286                    "example": 300
287                  },
288                  "memoryMbytes": {
289                    "type": "integer",
290                    "example": 1024
291                  },
292                  "diskMbytes": {
293                    "type": "integer",
294                    "example": 2048
295                  }
296                }
297              },
298              "buildId": {
299                "type": "string"
300              },
301              "defaultKeyValueStoreId": {
302                "type": "string"
303              },
304              "defaultDatasetId": {
305                "type": "string"
306              },
307              "defaultRequestQueueId": {
308                "type": "string"
309              },
310              "buildNumber": {
311                "type": "string",
312                "example": "1.0.0"
313              },
314              "containerUrl": {
315                "type": "string"
316              },
317              "usage": {
318                "type": "object",
319                "properties": {
320                  "ACTOR_COMPUTE_UNITS": {
321                    "type": "integer",
322                    "example": 0
323                  },
324                  "DATASET_READS": {
325                    "type": "integer",
326                    "example": 0
327                  },
328                  "DATASET_WRITES": {
329                    "type": "integer",
330                    "example": 0
331                  },
332                  "KEY_VALUE_STORE_READS": {
333                    "type": "integer",
334                    "example": 0
335                  },
336                  "KEY_VALUE_STORE_WRITES": {
337                    "type": "integer",
338                    "example": 1
339                  },
340                  "KEY_VALUE_STORE_LISTS": {
341                    "type": "integer",
342                    "example": 0
343                  },
344                  "REQUEST_QUEUE_READS": {
345                    "type": "integer",
346                    "example": 0
347                  },
348                  "REQUEST_QUEUE_WRITES": {
349                    "type": "integer",
350                    "example": 0
351                  },
352                  "DATA_TRANSFER_INTERNAL_GBYTES": {
353                    "type": "integer",
354                    "example": 0
355                  },
356                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
357                    "type": "integer",
358                    "example": 0
359                  },
360                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
361                    "type": "integer",
362                    "example": 0
363                  },
364                  "PROXY_SERPS": {
365                    "type": "integer",
366                    "example": 0
367                  }
368                }
369              },
370              "usageTotalUsd": {
371                "type": "number",
372                "example": 0.00005
373              },
374              "usageUsd": {
375                "type": "object",
376                "properties": {
377                  "ACTOR_COMPUTE_UNITS": {
378                    "type": "integer",
379                    "example": 0
380                  },
381                  "DATASET_READS": {
382                    "type": "integer",
383                    "example": 0
384                  },
385                  "DATASET_WRITES": {
386                    "type": "integer",
387                    "example": 0
388                  },
389                  "KEY_VALUE_STORE_READS": {
390                    "type": "integer",
391                    "example": 0
392                  },
393                  "KEY_VALUE_STORE_WRITES": {
394                    "type": "number",
395                    "example": 0.00005
396                  },
397                  "KEY_VALUE_STORE_LISTS": {
398                    "type": "integer",
399                    "example": 0
400                  },
401                  "REQUEST_QUEUE_READS": {
402                    "type": "integer",
403                    "example": 0
404                  },
405                  "REQUEST_QUEUE_WRITES": {
406                    "type": "integer",
407                    "example": 0
408                  },
409                  "DATA_TRANSFER_INTERNAL_GBYTES": {
410                    "type": "integer",
411                    "example": 0
412                  },
413                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
414                    "type": "integer",
415                    "example": 0
416                  },
417                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
418                    "type": "integer",
419                    "example": 0
420                  },
421                  "PROXY_SERPS": {
422                    "type": "integer",
423                    "example": 0
424                  }
425                }
426              }
427            }
428          }
429        }
430      }
431    }
432  }
433}

Dataset Validity Checker OpenAPI definition

OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.

OpenAPI is effective when used with AI agents and GPTs by standardizing how these systems interact with various APIs, for reliable integrations and efficient communication.

By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.

You can download the OpenAPI definitions for Dataset Validity Checker from the options below:

If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.

You can also check out our other API clients:

Developer
Maintained by Community

Actor Metrics

  • 2 monthly users

  • 3 bookmarks

  • 96% runs succeeded

  • Created in Aug 2019

  • Modified 2 years ago

Categories