data:image/s3,"s3://crabby-images/4a47f/4a47fae675ba797d823fdbf8f02f32e41d6ee71e" alt="Extended GPT Scraper avatar"
Extended GPT Scraper
No credit card required
data:image/s3,"s3://crabby-images/4a47f/4a47fae675ba797d823fdbf8f02f32e41d6ee71e" alt="Extended GPT Scraper"
Extended GPT Scraper
No credit card required
Extract data from any website and feed it into GPT via the OpenAI API. Use ChatGPT to proofread content, analyze sentiment, summarize reviews, extract contact details, and much more.
You can access the Extended GPT Scraper programmatically from your own applications by using the Apify API. Choose your preferred language below. To use the Apify API, you’ll need an Apify account and your API token, which you can find in the Integrations settings in Apify Console.
1{
2 "openapi": "3.0.1",
3 "info": {
4 "version": "0.0",
5 "x-build-id": "6pcYVai0QQhxQlZre"
6 },
7 "servers": [
8 {
9 "url": "https://api.apify.com/v2"
10 }
11 ],
12 "paths": {
13 "/acts/drobnikj~extended-gpt-scraper/run-sync-get-dataset-items": {
14 "post": {
15 "operationId": "run-sync-get-dataset-items-drobnikj-extended-gpt-scraper",
16 "x-openai-isConsequential": false,
17 "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
18 "tags": [
19 "Run Actor"
20 ],
21 "requestBody": {
22 "required": true,
23 "content": {
24 "application/json": {
25 "schema": {
26 "$ref": "#/components/schemas/inputSchema"
27 }
28 }
29 }
30 },
31 "parameters": [
32 {
33 "name": "token",
34 "in": "query",
35 "required": true,
36 "schema": {
37 "type": "string"
38 },
39 "description": "Enter your Apify token here"
40 }
41 ],
42 "responses": {
43 "200": {
44 "description": "OK"
45 }
46 }
47 }
48 },
49 "/acts/drobnikj~extended-gpt-scraper/runs": {
50 "post": {
51 "operationId": "runs-sync-drobnikj-extended-gpt-scraper",
52 "x-openai-isConsequential": false,
53 "summary": "Executes an Actor and returns information about the initiated run in response.",
54 "tags": [
55 "Run Actor"
56 ],
57 "requestBody": {
58 "required": true,
59 "content": {
60 "application/json": {
61 "schema": {
62 "$ref": "#/components/schemas/inputSchema"
63 }
64 }
65 }
66 },
67 "parameters": [
68 {
69 "name": "token",
70 "in": "query",
71 "required": true,
72 "schema": {
73 "type": "string"
74 },
75 "description": "Enter your Apify token here"
76 }
77 ],
78 "responses": {
79 "200": {
80 "description": "OK",
81 "content": {
82 "application/json": {
83 "schema": {
84 "$ref": "#/components/schemas/runsResponseSchema"
85 }
86 }
87 }
88 }
89 }
90 }
91 },
92 "/acts/drobnikj~extended-gpt-scraper/run-sync": {
93 "post": {
94 "operationId": "run-sync-drobnikj-extended-gpt-scraper",
95 "x-openai-isConsequential": false,
96 "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
97 "tags": [
98 "Run Actor"
99 ],
100 "requestBody": {
101 "required": true,
102 "content": {
103 "application/json": {
104 "schema": {
105 "$ref": "#/components/schemas/inputSchema"
106 }
107 }
108 }
109 },
110 "parameters": [
111 {
112 "name": "token",
113 "in": "query",
114 "required": true,
115 "schema": {
116 "type": "string"
117 },
118 "description": "Enter your Apify token here"
119 }
120 ],
121 "responses": {
122 "200": {
123 "description": "OK"
124 }
125 }
126 }
127 }
128 },
129 "components": {
130 "schemas": {
131 "inputSchema": {
132 "type": "object",
133 "required": [
134 "startUrls",
135 "instructions",
136 "openaiApiKey",
137 "model"
138 ],
139 "properties": {
140 "startUrls": {
141 "title": "Start URLs",
142 "type": "array",
143 "description": "A static list of URLs to scrape. <br><br>For details, see <a href='https://apify.com/drobnikj/extended-gpt-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.",
144 "items": {
145 "type": "object",
146 "required": [
147 "url"
148 ],
149 "properties": {
150 "url": {
151 "type": "string",
152 "title": "URL of a web page",
153 "format": "uri"
154 }
155 }
156 }
157 },
158 "openaiApiKey": {
159 "title": "OpenAI API key",
160 "type": "string",
161 "description": "The API key for accessing OpenAI. You can get it from <a href='https://platform.openai.com/account/api-keys' target='_blank' rel='noopener'>OpenAI platform</a>."
162 },
163 "instructions": {
164 "title": "Instructions for GPT",
165 "type": "string",
166 "description": "Instruct GPT how to generate text. For example: \"Summarize this page in three sentences.\"<br><br>You can instruct OpenAI to answer with \"skip this page\", which will skip the page. For example: \"Summarize this page in three sentences. If the page is about Apify Proxy, answer with 'skip this page'.\"."
167 },
168 "model": {
169 "title": "GPT model",
170 "enum": [
171 "gpt-3.5-turbo",
172 "gpt-3.5-turbo-16k",
173 "gpt-4",
174 "gpt-4-32k",
175 "text-davinci-003",
176 "gpt-4-turbo",
177 "gpt-4o",
178 "gpt-4o-mini"
179 ],
180 "type": "string",
181 "description": "Select a GPT model. See <a href='https://platform.openai.com/docs/models/overview' target='_blank' rel='noopener'>models overview</a>. Keep in mind that each model has different pricing and features.",
182 "default": "gpt-3.5-turbo"
183 },
184 "includeUrlGlobs": {
185 "title": "Include URLs (globs)",
186 "type": "array",
187 "description": "Glob patterns matching URLs of pages that will be included in crawling. Combine them with the link selector to tell the scraper where to find links. You need to use both globs and link selector to crawl further pages.",
188 "default": [],
189 "items": {
190 "type": "object",
191 "required": [
192 "glob"
193 ],
194 "properties": {
195 "glob": {
196 "type": "string",
197 "title": "Glob of a web page"
198 }
199 }
200 }
201 },
202 "excludeUrlGlobs": {
203 "title": "Exclude URLs (globs)",
204 "type": "array",
205 "description": "Glob patterns matching URLs of pages that will be excluded from crawling. Note that this affects only links found on pages, but not Start URLs, which are always crawled.",
206 "default": [],
207 "items": {
208 "type": "object",
209 "required": [
210 "glob"
211 ],
212 "properties": {
213 "glob": {
214 "type": "string",
215 "title": "Glob of a web page"
216 }
217 }
218 }
219 },
220 "maxCrawlingDepth": {
221 "title": "Max crawling depth",
222 "minimum": 0,
223 "type": "integer",
224 "description": "This specifies how many links away from the <b>Start URLs</b> the scraper will descend. This value is a safeguard against infinite crawling depths for misconfigured scrapers.<br><br>If set to <code>0</code>, there is no limit.",
225 "default": 99999999
226 },
227 "maxPagesPerCrawl": {
228 "title": "Max pages per run",
229 "minimum": 0,
230 "type": "integer",
231 "description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
232 "default": 10
233 },
234 "linkSelector": {
235 "title": "Link selector",
236 "type": "string",
237 "description": "This is a CSS selector that says which links on the page (<code>&lt;a&gt;</code> elements with <code>href</code> attribute) should be followed and added to the request queue. To filter the links added to the queue, use the <b>Include URLs (globs)</b> and <b>Exclude URLs (globs)</b> settings.<br><br>If <b>Link selector</b> is empty, the page links are ignored.<br><br>For details, see <a href='https://apify.com/drobnikj/extended-gpt-scraper#link-selector' target='_blank' rel='noopener'>Link selector</a> in README."
238 },
239 "initialCookies": {
240 "title": "Initial cookies",
241 "type": "array",
242 "description": "Cookies that will be pre-set to all pages the scraper opens. This is useful for pages that require login. The value is expected to be a JSON array of objects with `name`, `value`, `domain` and `path` properties. For example: `[{\"name\": \"cookieName\", \"value\": \"cookieValue\", \"domain\": \".domain.com\", \"path\": \"/\"}]`.\n\nYou can use the [EditThisCookie](https://chrome.google.com/webstore/detail/editthiscookie/fngmhnnpilhplaeedifhccceomclgfbg) browser extension to copy browser cookies in this format, and paste it here.",
243 "default": []
244 },
245 "proxyConfiguration": {
246 "title": "Proxy configuration",
247 "type": "object",
248 "description": "This specifies the proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/drobnikj/extended-gpt-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
249 "default": {
250 "useApifyProxy": false
251 }
252 },
253 "temperature": {
254 "title": "Temperature",
255 "type": "string",
256 "description": "Controls randomness: Lowering results in less random completions. As the temperature approaches zero, the model will become deterministic and repetitive. For consistent results, we recommend setting the temperature to 0.",
257 "default": "0"
258 },
259 "topP": {
260 "title": "TopP",
261 "type": "string",
262 "description": "Controls diversity via nucleus sampling: 0.5 means half of all likelihood-weighted options are considered.",
263 "default": "1"
264 },
265 "frequencyPenalty": {
266 "title": "Frequency penalty",
267 "type": "string",
268 "description": "How much to penalize new tokens based on their existing frequency in the text so far. Decreases the model's likelihood to repeat the same line verbatim.",
269 "default": "0"
270 },
271 "presencePenalty": {
272 "title": "Presence penalty",
273 "type": "string",
274 "description": "How much to penalize new tokens based on whether they appear in the text so far. Increases the model's likelihood to talk about new topics.",
275 "default": "0"
276 },
277 "targetSelector": {
278 "title": "Content selector",
279 "type": "string",
280 "description": "A CSS selector of the HTML element on the page that will be used in the instruction. Instead of a whole page, you can use only part of the page. For example: \"div#content\"."
281 },
282 "removeElementsCssSelector": {
283 "title": "Remove HTML elements (CSS selector)",
284 "type": "string",
285 "description": "A CSS selector matching HTML elements that will be removed from the DOM, before sending it to GPT processing. This is useful to skip irrelevant page content and save on GPT input tokens. \n\nBy default, the Actor removes usually unwanted elements like scripts, styles and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
286 "default": "script, style, noscript, path, svg, xlink"
287 },
288 "pageFormatInRequest": {
289 "title": "Page format in request",
290 "enum": [
291 "HTML",
292 "Markdown"
293 ],
294 "type": "string",
295 "description": "In what format to send the content extracted from the page to the GPT. Markdown will take less space allowing for larger requests, while HTML may help include some information like attributes that may otherwise be omitted.",
296 "default": "Markdown"
297 },
298 "skipGptGlobs": {
299 "title": "Skip GPT processing for Globs",
300 "type": "array",
301 "description": "This setting allows you to specify certain page URLs to skip GPT instructions for. Pages matching these glob patterns will only be crawled for links, excluding them from GPT processing. Useful for intermediary pages used for navigation or undesired content.",
302 "default": [],
303 "items": {
304 "type": "object",
305 "required": [
306 "glob"
307 ],
308 "properties": {
309 "glob": {
310 "type": "string",
311 "title": "Glob of a web page"
312 }
313 }
314 }
315 },
316 "dynamicContentWaitSecs": {
317 "title": "Wait for dynamic content (seconds)",
318 "minimum": 0,
319 "maximum": 10,
320 "type": "integer",
321 "description": "The maximum time to wait for dynamic page content to load. The crawler will continue either if this time elapses, or if it detects the network became idle as there are no more requests for additional resources.",
322 "default": 0
323 },
324 "removeLinkUrls": {
325 "title": "Remove link URLs",
326 "type": "boolean",
327 "description": "Removes web link URLs while keeping the text content they display.\n- This helps reduce the total page content by eliminating unnecessary URLs before sending to GPT\n- Useful if you are hitting maximum input tokens limits",
328 "default": false
329 },
330 "useStructureOutput": {
331 "title": "Use JSON schema to format answer",
332 "type": "boolean",
333 "description": "If true, the answer will be transformed into a structured format based on the schema in the `schema` attribute."
334 },
335 "schema": {
336 "title": "JSON schema format",
337 "type": "object",
338 "description": "Defines how the output will be stored in structured format using [JSON Schema](https://json-schema.org/understanding-json-schema/). Keep in mind that it uses [function](https://platform.openai.com/docs/api-reference/chat/create#chat/create-functions), so by setting the description of the fields and the correct title, you can get better results."
339 },
340 "schemaDescription": {
341 "title": "Schema description",
342 "type": "string",
343 "description": "Description of the schema function. Use this to provide more context for the schema.\n\nBy default, the `instructions` field's value is used as the schema description; you can change it here."
344 },
345 "saveSnapshots": {
346 "title": "Save debug snapshots",
347 "type": "boolean",
348 "description": "For each page, store its HTML, screenshot, and parsed content (Markdown/HTML as it was sent to ChatGPT), and add links to them to the output.",
349 "default": true
350 }
351 }
352 },
353 "runsResponseSchema": {
354 "type": "object",
355 "properties": {
356 "data": {
357 "type": "object",
358 "properties": {
359 "id": {
360 "type": "string"
361 },
362 "actId": {
363 "type": "string"
364 },
365 "userId": {
366 "type": "string"
367 },
368 "startedAt": {
369 "type": "string",
370 "format": "date-time",
371 "example": "2025-01-08T00:00:00.000Z"
372 },
373 "finishedAt": {
374 "type": "string",
375 "format": "date-time",
376 "example": "2025-01-08T00:00:00.000Z"
377 },
378 "status": {
379 "type": "string",
380 "example": "READY"
381 },
382 "meta": {
383 "type": "object",
384 "properties": {
385 "origin": {
386 "type": "string",
387 "example": "API"
388 },
389 "userAgent": {
390 "type": "string"
391 }
392 }
393 },
394 "stats": {
395 "type": "object",
396 "properties": {
397 "inputBodyLen": {
398 "type": "integer",
399 "example": 2000
400 },
401 "rebootCount": {
402 "type": "integer",
403 "example": 0
404 },
405 "restartCount": {
406 "type": "integer",
407 "example": 0
408 },
409 "resurrectCount": {
410 "type": "integer",
411 "example": 0
412 },
413 "computeUnits": {
414 "type": "integer",
415 "example": 0
416 }
417 }
418 },
419 "options": {
420 "type": "object",
421 "properties": {
422 "build": {
423 "type": "string",
424 "example": "latest"
425 },
426 "timeoutSecs": {
427 "type": "integer",
428 "example": 300
429 },
430 "memoryMbytes": {
431 "type": "integer",
432 "example": 1024
433 },
434 "diskMbytes": {
435 "type": "integer",
436 "example": 2048
437 }
438 }
439 },
440 "buildId": {
441 "type": "string"
442 },
443 "defaultKeyValueStoreId": {
444 "type": "string"
445 },
446 "defaultDatasetId": {
447 "type": "string"
448 },
449 "defaultRequestQueueId": {
450 "type": "string"
451 },
452 "buildNumber": {
453 "type": "string",
454 "example": "1.0.0"
455 },
456 "containerUrl": {
457 "type": "string"
458 },
459 "usage": {
460 "type": "object",
461 "properties": {
462 "ACTOR_COMPUTE_UNITS": {
463 "type": "integer",
464 "example": 0
465 },
466 "DATASET_READS": {
467 "type": "integer",
468 "example": 0
469 },
470 "DATASET_WRITES": {
471 "type": "integer",
472 "example": 0
473 },
474 "KEY_VALUE_STORE_READS": {
475 "type": "integer",
476 "example": 0
477 },
478 "KEY_VALUE_STORE_WRITES": {
479 "type": "integer",
480 "example": 1
481 },
482 "KEY_VALUE_STORE_LISTS": {
483 "type": "integer",
484 "example": 0
485 },
486 "REQUEST_QUEUE_READS": {
487 "type": "integer",
488 "example": 0
489 },
490 "REQUEST_QUEUE_WRITES": {
491 "type": "integer",
492 "example": 0
493 },
494 "DATA_TRANSFER_INTERNAL_GBYTES": {
495 "type": "integer",
496 "example": 0
497 },
498 "DATA_TRANSFER_EXTERNAL_GBYTES": {
499 "type": "integer",
500 "example": 0
501 },
502 "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
503 "type": "integer",
504 "example": 0
505 },
506 "PROXY_SERPS": {
507 "type": "integer",
508 "example": 0
509 }
510 }
511 },
512 "usageTotalUsd": {
513 "type": "number",
514 "example": 0.00005
515 },
516 "usageUsd": {
517 "type": "object",
518 "properties": {
519 "ACTOR_COMPUTE_UNITS": {
520 "type": "integer",
521 "example": 0
522 },
523 "DATASET_READS": {
524 "type": "integer",
525 "example": 0
526 },
527 "DATASET_WRITES": {
528 "type": "integer",
529 "example": 0
530 },
531 "KEY_VALUE_STORE_READS": {
532 "type": "integer",
533 "example": 0
534 },
535 "KEY_VALUE_STORE_WRITES": {
536 "type": "number",
537 "example": 0.00005
538 },
539 "KEY_VALUE_STORE_LISTS": {
540 "type": "integer",
541 "example": 0
542 },
543 "REQUEST_QUEUE_READS": {
544 "type": "integer",
545 "example": 0
546 },
547 "REQUEST_QUEUE_WRITES": {
548 "type": "integer",
549 "example": 0
550 },
551 "DATA_TRANSFER_INTERNAL_GBYTES": {
552 "type": "integer",
553 "example": 0
554 },
555 "DATA_TRANSFER_EXTERNAL_GBYTES": {
556 "type": "integer",
557 "example": 0
558 },
559 "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
560 "type": "integer",
561 "example": 0
562 },
563 "PROXY_SERPS": {
564 "type": "integer",
565 "example": 0
566 }
567 }
568 }
569 }
570 }
571 }
572 }
573 }
574 }
575}
Extended GPT Scraper OpenAPI definition
OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.
OpenAPI is effective when used with AI agents and GPTs by standardizing how these systems interact with various APIs, for reliable integrations and efficient communication.
By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.
You can download the OpenAPI definitions for Extended GPT Scraper from the options below:
If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.
You can also check out our other API clients:
Actor Metrics
88 monthly users
-
65 bookmarks
86% runs succeeded
4.4 hours response time
Created in Jun 2023
Modified a month ago