article-scrapper
Pricing
$3.00 / 1,000 result_sets
article-scrapper
A flexible and powerful Apify Actor for scraping articles from tech news websites. This scraper works with any tech news site, using either built-in presets or custom URLs.
Pricing
$3.00 / 1,000 result_sets
A flexible and powerful Apify Actor for scraping articles from tech news websites. This scraper works with any tech news site, using either built-in presets or custom URLs.
You can access the article-scrapper programmatically from your own applications by using the Apify API. You can also choose your preferred programming language below. To use the Apify API, you’ll need an Apify account and your API token, found in the Integrations settings in Apify Console.
{ "openapi": "3.0.1", "info": { "version": "0.0", "x-build-id": "g62XCcN0o8U5F6RUh" }, "servers": [ { "url": "https://api.apify.com/v2" } ], "paths": { "/acts/credible_sandal~article-scrapper/run-sync-get-dataset-items": { "post": { "operationId": "run-sync-get-dataset-items-credible_sandal-article-scrapper", "x-openai-isConsequential": false, "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.", "tags": [ "Run Actor" ], "requestBody": { "required": true, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/inputSchema" } } } }, "parameters": [ { "name": "token", "in": "query", "required": true, "schema": { "type": "string" }, "description": "Enter your Apify token here" } ], "responses": { "200": { "description": "OK" } } } }, "/acts/credible_sandal~article-scrapper/runs": { "post": { "operationId": "runs-sync-credible_sandal-article-scrapper", "x-openai-isConsequential": false, "summary": "Executes an Actor and returns information about the initiated run in response.", "tags": [ "Run Actor" ], "requestBody": { "required": true, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/inputSchema" } } } }, "parameters": [ { "name": "token", "in": "query", "required": true, "schema": { "type": "string" }, "description": "Enter your Apify token here" } ], "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/runsResponseSchema" } } } } } } }, "/acts/credible_sandal~article-scrapper/run-sync": { "post": { "operationId": "run-sync-credible_sandal-article-scrapper", "x-openai-isConsequential": false, "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.", "tags": [ "Run Actor" ], "requestBody": { "required": true, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/inputSchema" } } } }, "parameters": [ { "name": "token", "in": 
"query", "required": true, "schema": { "type": "string" }, "description": "Enter your Apify token here" } ], "responses": { "200": { "description": "OK" } } } } }, "components": { "schemas": { "inputSchema": { "type": "object", "properties": { "usePresets": { "title": "Use Preset Sites", "type": "boolean", "description": "Use predefined news sites or provide custom URLs", "default": true }, "presetSources": { "title": "Preset News Sources", "uniqueItems": true, "type": "array", "description": "Select from predefined tech news sources (only used if 'Use Preset Sites' is checked)", "items": { "type": "string", "enum": [ "verge", "cnet", "wired", "techcrunch", "arstechnica", "engadget", "theguardian-tech", "thenextweb" ] }, "default": [ "verge", "techcrunch", "wired" ] }, "customUrls": { "title": "Custom URLs", "uniqueItems": true, "type": "array", "description": "Enter custom URLs to scrape (one per line). Can be homepages or article listing pages. Used when 'Use Preset Sites' is unchecked.", "items": { "type": "string", "pattern": "^https?://.*" } }, "maxArticlesPerSource": { "title": "Max Articles Per Source", "minimum": 1, "maximum": 100, "type": "integer", "description": "Maximum number of articles to scrape from each source", "default": 10 }, "maxPages": { "title": "Max Listing Pages", "minimum": 1, "maximum": 10, "type": "integer", "description": "Maximum number of listing pages to check per source", "default": 1 }, "includeContent": { "title": "Include Full Article Content", "type": "boolean", "description": "Extract full article text (may increase scraping time)", "default": true }, "searchKeywords": { "title": "Search Keywords (Optional)", "type": "array", "description": "Filter articles by keywords. Only articles containing ANY of these keywords in title or description will be included. 
Leave empty to scrape all articles.", "items": { "type": "string" } }, "titleContains": { "title": "Title Must Contain (Optional)", "type": "string", "description": "Only scrape articles where the title contains this text (case-insensitive). Leave empty to scrape all articles." }, "descriptionContains": { "title": "Description Must Contain (Optional)", "type": "string", "description": "Only scrape articles where the summary/description contains this text (case-insensitive). Leave empty to scrape all articles." } } }, "runsResponseSchema": { "type": "object", "properties": { "data": { "type": "object", "properties": { "id": { "type": "string" }, "actId": { "type": "string" }, "userId": { "type": "string" }, "startedAt": { "type": "string", "format": "date-time", "example": "2025-01-08T00:00:00.000Z" }, "finishedAt": { "type": "string", "format": "date-time", "example": "2025-01-08T00:00:00.000Z" }, "status": { "type": "string", "example": "READY" }, "meta": { "type": "object", "properties": { "origin": { "type": "string", "example": "API" }, "userAgent": { "type": "string" } } }, "stats": { "type": "object", "properties": { "inputBodyLen": { "type": "integer", "example": 2000 }, "rebootCount": { "type": "integer", "example": 0 }, "restartCount": { "type": "integer", "example": 0 }, "resurrectCount": { "type": "integer", "example": 0 }, "computeUnits": { "type": "integer", "example": 0 } } }, "options": { "type": "object", "properties": { "build": { "type": "string", "example": "latest" }, "timeoutSecs": { "type": "integer", "example": 300 }, "memoryMbytes": { "type": "integer", "example": 1024 }, "diskMbytes": { "type": "integer", "example": 2048 } } }, "buildId": { "type": "string" }, "defaultKeyValueStoreId": { "type": "string" }, "defaultDatasetId": { "type": "string" }, "defaultRequestQueueId": { "type": "string" }, "buildNumber": { "type": "string", "example": "1.0.0" }, "containerUrl": { "type": "string" }, "usage": { "type": "object", "properties": { 
"ACTOR_COMPUTE_UNITS": { "type": "integer", "example": 0 }, "DATASET_READS": { "type": "integer", "example": 0 }, "DATASET_WRITES": { "type": "integer", "example": 0 }, "KEY_VALUE_STORE_READS": { "type": "integer", "example": 0 }, "KEY_VALUE_STORE_WRITES": { "type": "integer", "example": 1 }, "KEY_VALUE_STORE_LISTS": { "type": "integer", "example": 0 }, "REQUEST_QUEUE_READS": { "type": "integer", "example": 0 }, "REQUEST_QUEUE_WRITES": { "type": "integer", "example": 0 }, "DATA_TRANSFER_INTERNAL_GBYTES": { "type": "integer", "example": 0 }, "DATA_TRANSFER_EXTERNAL_GBYTES": { "type": "integer", "example": 0 }, "PROXY_RESIDENTIAL_TRANSFER_GBYTES": { "type": "integer", "example": 0 }, "PROXY_SERPS": { "type": "integer", "example": 0 } } }, "usageTotalUsd": { "type": "number", "example": 0.00005 }, "usageUsd": { "type": "object", "properties": { "ACTOR_COMPUTE_UNITS": { "type": "integer", "example": 0 }, "DATASET_READS": { "type": "integer", "example": 0 }, "DATASET_WRITES": { "type": "integer", "example": 0 }, "KEY_VALUE_STORE_READS": { "type": "integer", "example": 0 }, "KEY_VALUE_STORE_WRITES": { "type": "number", "example": 0.00005 }, "KEY_VALUE_STORE_LISTS": { "type": "integer", "example": 0 }, "REQUEST_QUEUE_READS": { "type": "integer", "example": 0 }, "REQUEST_QUEUE_WRITES": { "type": "integer", "example": 0 }, "DATA_TRANSFER_INTERNAL_GBYTES": { "type": "integer", "example": 0 }, "DATA_TRANSFER_EXTERNAL_GBYTES": { "type": "integer", "example": 0 }, "PROXY_RESIDENTIAL_TRANSFER_GBYTES": { "type": "integer", "example": 0 }, "PROXY_SERPS": { "type": "integer", "example": 0 } } } } } } } } }}OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.
OpenAPI is effective when used with AI agents and GPTs by standardizing how these systems interact with various APIs, for reliable integrations and efficient communication.
By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.
You can download the OpenAPI definitions for article-scrapper from the options below:
If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.
You can also check out our other API clients: