{
	"actorSpecification": 1,
	"name": "my-actor",
	"title": "Scrape single page in TypeScript",
	"description": "Scrape data from single page with provided URL.",
	"version": "0.0",
	"meta": {
		"templateId": "ts-start",
		"model": "<FILL-IN-MODEL>"
	},
	"input": "./input_schema.json",
	"dockerfile": "../Dockerfile",
	"storages": {
		"dataset": "./dataset_schema.json"
	}
}

.actor/dataset_schema.json

{
    "actorSpecification": 1,
    "fields": {},
    "views": {
        "overview": {
            "title": "Overview",
            "transformation": {
                "fields": [
                    "level",
                    "text"
                ]
            },
            "display": {
                "component": "table",
                "properties": {
                    "level": {
                        "label": "Level",
                        "format": "text"
                    },
                    "text": {
                        "label": "Text",
                        "format": "text"
                    }
                }
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "URL of the page",
            "type": "string",
            "description": "The URL of website you want to get the data from.",
            "editor": "textfield",
            "prefill": "https://www.apify.com"
        }
    },
    "required": ["url"]
}

.actor/output_schema.json

{
  "actorOutputSchemaVersion": 1,
  "title": "Output schema of the files scraper",
  "description": "This is the output schema for the files scraper",
  "properties": {
    "level": {
      "type": "string",
      "title": "Level",
      "template": "{{links.apiDefaultDatasetUrl}}/items?view=level"
    },
    "title": {
      "type": "string",
      "title": "Title",
      "template": "{{links.apiDefaultDatasetUrl}}/items?view=title"
    }
  }
}

src/main.ts

1// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
2import { Actor } from 'apify';
3// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
4import axios from 'axios';
5// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
6import * as cheerio from 'cheerio';
7
8// this is ESM project, and as such, it requires you to specify extensions in your relative imports
9// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
10// note that we need to use `.js` even when inside TS files
11// import { router } from './routes.js';
12
13// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
14await Actor.init();
15
16interface Input {
17    url: string;
18}
19// Structure of input is defined in input_schema.json
20const input = await Actor.getInput<Input>();
21if (!input) throw new Error('Input is missing!');
22const { url } = input;
23
24// Fetch the HTML content of the page.
25const response = await axios.get(url);
26
27// Parse the downloaded HTML with Cheerio to enable data extraction.
28const $ = cheerio.load(response.data);
29
30// Extract all headings from the page (tag name and text).
31const headings: { level: string; text: string }[] = [];
32$('h1, h2, h3, h4, h5, h6').each((_i, element) => {
33    const headingObject = {
34        level: $(element).prop('tagName')!.toLowerCase(),
35        text: $(element).text(),
36    };
37    console.log('Extracted heading', headingObject);
38    headings.push(headingObject);
39});
40
41// Save headings to Dataset - a table-like storage.
42await Actor.pushData(headings);
43
44// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
45await Actor.exit();

.dockerignore

# configurations
.idea
.vscode
.zed

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

# dist folder
dist

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.vscode
.zed
storage
apify_storage
crawlee_storage
node_modules
dist
tsconfig.tsbuildinfo
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

# Added by Apify CLI
.venv

.prettierignore

.prettierignore

.prettierrc

{
    "printWidth": 120,
    "singleQuote": true,
    "tabWidth": 4
}

AGENTS.md

1# Apify Actors Development Guide
2
3Important: Before you begin, fill in the model property in the meta section of `.actor/actor.json`. Replace it with the model you're currently using. This helps Apify monitor and improve AGENTS.md for specific LLM models.
4
5## What are Apify Actors?
6
7- Actors are serverless programs that run in the cloud. They're inspired by the UNIX philosophy - programs that do one thing well and can be easily combined to build complex systems.
8- Actors are programs packaged as Docker images that run in isolated containers
9
10## Core Concepts
11
12- Accept well-defined JSON input
13- Perform isolated tasks (web scraping, automation, data processing)
14- Produce structured JSON output to datasets and/or store data in key-value stores
15- Can run from seconds to hours or even indefinitely
16- Persist state and can be restarted
17
18## Do
19
20- accept well-defined JSON input and produce structured JSON output
21- use Apify SDK (`apify`) for code running ON Apify platform
22- validate input early with proper error handling and fail gracefully
23- use CheerioCrawler for static HTML content (10x faster than browsers)
24- use PlaywrightCrawler only for JavaScript-heavy sites and dynamic content
25- use router pattern (createCheerioRouter/createPlaywrightRouter) for complex crawls
26- implement retry strategies with exponential backoff for failed requests
27- use proper concurrency settings (HTTP: 10-50, Browser: 1-5)
28- set sensible defaults in `.actor/input_schema.json` for all optional fields
29- set up output schema in `.actor/output_schema.json`
30- clean and validate data before pushing to dataset
31- use semantic CSS selectors and fallback strategies for missing elements
32- respect robots.txt, ToS, and implement rate limiting with delays
33- check which tools (cheerio/playwright/crawlee) are installed before applying guidance
34
35## Don't
36
37- do not rely on `Dataset.getInfo()` for final counts on Cloud platform
38- do not use browser crawlers when HTTP/Cheerio works (massive performance gains with HTTP)
39- do not hard code values that should be in input schema or environment variables
40- do not skip input validation or error handling
41- do not overload servers - use appropriate concurrency and delays
42- do not scrape prohibited content or ignore Terms of Service
43- do not store personal/sensitive data unless explicitly permitted
44- do not use deprecated options like `requestHandlerTimeoutMillis` on CheerioCrawler (v3.x)
45- do not use `additionalHttpHeaders` - use `preNavigationHooks` instead
46
47## Commands
48
49```bash
50# Local development
51apify run                              # Run Actor locally
52
53# Authentication & deployment
54apify login                            # Authenticate account
55apify push                             # Deploy to Apify platform
56
57# Help
58apify help                             # List all commands
59```
60
61## Safety and Permissions
62
63Allowed without prompt:
64
65- read files with `Actor.getValue()`
66- push data with `Actor.pushData()`
67- set values with `Actor.setValue()`
68- enqueue requests to RequestQueue
69- run locally with `apify run`
70
71Ask first:
72
73- npm/pip package installations
74- apify push (deployment to cloud)
75- proxy configuration changes (requires paid plan)
76- Dockerfile changes affecting builds
77- deleting datasets or key-value stores
78
79## Project Structure
80
81.actor/
82├── actor.json # Actor config: name, version, env vars, runtime settings
83├── input_schema.json # Input validation & Console form definition
84└── output_schema.json # Specifies where an Actor stores its output
85src/
86└── main.js # Actor entry point and orchestrator
87storage/ # Local storage (mirrors Cloud during development)
88├── datasets/ # Output items (JSON objects)
89├── key_value_stores/ # Files, config, INPUT
90└── request_queues/ # Pending crawl requests
91Dockerfile # Container image definition
92AGENTS.md # AI agent instructions (this file)
93
94## Actor Input Schema
95
96The input schema defines the input parameters for an Actor. It's a JSON object comprising various field types supported by the Apify platform.
97
98### Structure
99
100```json
101{
102    "title": "<INPUT-SCHEMA-TITLE>",
103    "type": "object",
104    "schemaVersion": 1,
105    "properties": {
106        /* define input fields here */
107    },
108    "required": []
109}
110```
111
112### Example
113
114```json
115{
116    "title": "E-commerce Product Scraper Input",
117    "type": "object",
118    "schemaVersion": 1,
119    "properties": {
120        "startUrls": {
121            "title": "Start URLs",
122            "type": "array",
123            "description": "URLs to start scraping from (category pages or product pages)",
124            "editor": "requestListSources",
125            "default": [{ "url": "https://example.com/category" }],
126            "prefill": [{ "url": "https://example.com/category" }]
127        },
128        "followVariants": {
129            "title": "Follow Product Variants",
130            "type": "boolean",
131            "description": "Whether to scrape product variants (different colors, sizes)",
132            "default": true
133        },
134        "maxRequestsPerCrawl": {
135            "title": "Max Requests per Crawl",
136            "type": "integer",
137            "description": "Maximum number of pages to scrape (0 = unlimited)",
138            "default": 1000,
139            "minimum": 0
140        },
141        "proxyConfiguration": {
142            "title": "Proxy Configuration",
143            "type": "object",
144            "description": "Proxy settings for anti-bot protection",
145            "editor": "proxy",
146            "default": { "useApifyProxy": false }
147        },
148        "locale": {
149            "title": "Locale",
150            "type": "string",
151            "description": "Language/country code for localized content",
152            "default": "cs",
153            "enum": ["cs", "en", "de", "sk"],
154            "enumTitles": ["Czech", "English", "German", "Slovak"]
155        }
156    },
157    "required": ["startUrls"]
158}
159```
160
161## Actor Output Schema
162
163The Actor output schema builds upon the schemas for the dataset and key-value store. It specifies where an Actor stores its output and defines templates for accessing that output. Apify Console uses these output definitions to display run results.
164
165### Structure
166
167```json
168{
169    "actorOutputSchemaVersion": 1,
170    "title": "<OUTPUT-SCHEMA-TITLE>",
171    "properties": {
172        /* define your outputs here */
173    }
174}
175```
176
177### Example
178
179```json
180{
181    "actorOutputSchemaVersion": 1,
182    "title": "Output schema of the files scraper",
183    "properties": {
184        "files": {
185            "type": "string",
186            "title": "Files",
187            "template": "{{links.apiDefaultKeyValueStoreUrl}}/keys"
188        },
189        "dataset": {
190            "type": "string",
191            "title": "Dataset",
192            "template": "{{links.apiDefaultDatasetUrl}}/items"
193        }
194    }
195}
196```
197
198### Output Schema Template Variables
199
200- `links` (object) - Contains quick links to most commonly used URLs
201- `links.publicRunUrl` (string) - Public run url in format `https://console.apify.com/view/runs/:runId`
202- `links.consoleRunUrl` (string) - Console run url in format `https://console.apify.com/actors/runs/:runId`
203- `links.apiRunUrl` (string) - API run url in format `https://api.apify.com/v2/actor-runs/:runId`
204- `links.apiDefaultDatasetUrl` (string) - API url of default dataset in format `https://api.apify.com/v2/datasets/:defaultDatasetId`
205- `links.apiDefaultKeyValueStoreUrl` (string) - API url of default key-value store in format `https://api.apify.com/v2/key-value-stores/:defaultKeyValueStoreId`
206- `links.containerRunUrl` (string) - URL of a webserver running inside the run in format `https://<containerId>.runs.apify.net/`
207- `run` (object) - Contains information about the run same as it is returned from the `GET Run` API endpoint
208- `run.defaultDatasetId` (string) - ID of the default dataset
209- `run.defaultKeyValueStoreId` (string) - ID of the default key-value store
210
211## Dataset Schema Specification
212
213The dataset schema defines how your Actor's output data is structured, transformed, and displayed in the Output tab in the Apify Console.
214
215### Example
216
217Consider an example Actor that calls `Actor.pushData()` to store data into dataset:
218
219```typescript
220import { Actor } from 'apify';
221// Initialize the JavaScript SDK
222await Actor.init();
223
224/**
225 * Actor code
226 */
227await Actor.pushData({
228    numericField: 10,
229    pictureUrl: 'https://www.google.com/images/branding/googlelogo/2x/googlelogo_color_92x30dp.png',
230    linkUrl: 'https://google.com',
231    textField: 'Google',
232    booleanField: true,
233    dateField: new Date(),
234    arrayField: ['#hello', '#world'],
235    objectField: {},
236});
237
238// Exit successfully
239await Actor.exit();
240```
241
242To set up the Actor's output tab UI, reference a dataset schema file in `.actor/actor.json`:
243
244```json
245{
246    "actorSpecification": 1,
247    "name": "book-library-scraper",
248    "title": "Book Library Scraper",
249    "version": "1.0.0",
250    "storages": {
251        "dataset": "./dataset_schema.json"
252    }
253}
254```
255
256Then create the dataset schema in `.actor/dataset_schema.json`:
257
258```json
259{
260    "actorSpecification": 1,
261    "fields": {},
262    "views": {
263        "overview": {
264            "title": "Overview",
265            "transformation": {
266                "fields": [
267                    "pictureUrl",
268                    "linkUrl",
269                    "textField",
270                    "booleanField",
271                    "arrayField",
272                    "objectField",
273                    "dateField",
274                    "numericField"
275                ]
276            },
277            "display": {
278                "component": "table",
279                "properties": {
280                    "pictureUrl": {
281                        "label": "Image",
282                        "format": "image"
283                    },
284                    "linkUrl": {
285                        "label": "Link",
286                        "format": "link"
287                    },
288                    "textField": {
289                        "label": "Text",
290                        "format": "text"
291                    },
292                    "booleanField": {
293                        "label": "Boolean",
294                        "format": "boolean"
295                    },
296                    "arrayField": {
297                        "label": "Array",
298                        "format": "array"
299                    },
300                    "objectField": {
301                        "label": "Object",
302                        "format": "object"
303                    },
304                    "dateField": {
305                        "label": "Date",
306                        "format": "date"
307                    },
308                    "numericField": {
309                        "label": "Number",
310                        "format": "number"
311                    }
312                }
313            }
314        }
315    }
316}
317```
318
319### Structure
320
321```json
322{
323    "actorSpecification": 1,
324    "fields": {},
325    "views": {
326        "<VIEW_NAME>": {
327            "title": "string (required)",
328            "description": "string (optional)",
329            "transformation": {
330                "fields": ["string (required)"],
331                "unwind": ["string (optional)"],
332                "flatten": ["string (optional)"],
333                "omit": ["string (optional)"],
334                "limit": "integer (optional)",
335                "desc": "boolean (optional)"
336            },
337            "display": {
338                "component": "table (required)",
339                "properties": {
340                    "<FIELD_NAME>": {
341                        "label": "string (optional)",
342                        "format": "text|number|date|link|boolean|image|array|object (optional)"
343                    }
344                }
345            }
346        }
347    }
348}
349```
350
351**Dataset Schema Properties:**
352
353- `actorSpecification` (integer, required) - Specifies the version of dataset schema structure document (currently only version 1)
354- `fields` (JSONSchema object, required) - Schema of one dataset object (use JsonSchema Draft 2020-12 or compatible)
355- `views` (DatasetView object, required) - Object with API and UI views description
356
357**DatasetView Properties:**
358
359- `title` (string, required) - Visible in UI Output tab and API
360- `description` (string, optional) - Only available in API response
361- `transformation` (ViewTransformation object, required) - Data transformation applied when loading from Dataset API
362- `display` (ViewDisplay object, required) - Output tab UI visualization definition
363
364**ViewTransformation Properties:**
365
366- `fields` (string[], required) - Fields to present in output (order matches column order)
367- `unwind` (string[], optional) - Deconstructs nested children into parent object
368- `flatten` (string[], optional) - Transforms nested object into flat structure
369- `omit` (string[], optional) - Removes specified fields from output
370- `limit` (integer, optional) - Maximum number of results (default: all)
371- `desc` (boolean, optional) - Sort order (true = newest first)
372
373**ViewDisplay Properties:**
374
375- `component` (string, required) - Only `table` is available
376- `properties` (Object, optional) - Keys matching `transformation.fields` with ViewDisplayProperty values
377
378**ViewDisplayProperty Properties:**
379
380- `label` (string, optional) - Table column header
381- `format` (string, optional) - One of: `text`, `number`, `date`, `link`, `boolean`, `image`, `array`, `object`
382
383## Key-Value Store Schema Specification
384
385The key-value store schema organizes keys into logical groups called collections for easier data management.
386
387### Example
388
389Consider an example Actor that calls `Actor.setValue()` to save records into the key-value store:
390
391```typescript
392import { Actor } from 'apify';
393// Initialize the JavaScript SDK
394await Actor.init();
395
396/**
397 * Actor code
398 */
399await Actor.setValue('document-1', 'my text data', { contentType: 'text/plain' });
400
401await Actor.setValue(`image-${imageID}`, imageBuffer, { contentType: 'image/jpeg' });
402
403// Exit successfully
404await Actor.exit();
405```
406
407To configure the key-value store schema, reference a schema file in `.actor/actor.json`:
408
409```json
410{
411    "actorSpecification": 1,
412    "name": "data-collector",
413    "title": "Data Collector",
414    "version": "1.0.0",
415    "storages": {
416        "keyValueStore": "./key_value_store_schema.json"
417    }
418}
419```
420
421Then create the key-value store schema in `.actor/key_value_store_schema.json`:
422
423```json
424{
425    "actorKeyValueStoreSchemaVersion": 1,
426    "title": "Key-Value Store Schema",
427    "collections": {
428        "documents": {
429            "title": "Documents",
430            "description": "Text documents stored by the Actor",
431            "keyPrefix": "document-"
432        },
433        "images": {
434            "title": "Images",
435            "description": "Images stored by the Actor",
436            "keyPrefix": "image-",
437            "contentTypes": ["image/jpeg"]
438        }
439    }
440}
441```
442
443### Structure
444
445```json
446{
447    "actorKeyValueStoreSchemaVersion": 1,
448    "title": "string (required)",
449    "description": "string (optional)",
450    "collections": {
451        "<COLLECTION_NAME>": {
452            "title": "string (required)",
453            "description": "string (optional)",
454            "key": "string (conditional - use key OR keyPrefix)",
455            "keyPrefix": "string (conditional - use key OR keyPrefix)",
456            "contentTypes": ["string (optional)"],
457            "jsonSchema": "object (optional)"
458        }
459    }
460}
461```
462
463**Key-Value Store Schema Properties:**
464
465- `actorKeyValueStoreSchemaVersion` (integer, required) - Version of key-value store schema structure document (currently only version 1)
466- `title` (string, required) - Title of the schema
467- `description` (string, optional) - Description of the schema
468- `collections` (Object, required) - Object where each key is a collection ID and value is a Collection object
469
470**Collection Properties:**
471
472- `title` (string, required) - Collection title shown in UI tabs
473- `description` (string, optional) - Description appearing in UI tooltips
474- `key` (string, conditional) - Single specific key for this collection
475- `keyPrefix` (string, conditional) - Prefix for keys included in this collection
476- `contentTypes` (string[], optional) - Allowed content types for validation
477- `jsonSchema` (object, optional) - JSON Schema Draft 07 format for `application/json` content type validation
478
479Either `key` or `keyPrefix` must be specified for each collection, but not both.
480
481## Apify MCP Tools
482
483If MCP server is configured, use these tools for documentation:
484
485- `search-apify-docs` - Search documentation
486- `fetch-apify-docs` - Get full doc pages
487
488Otherwise, reference: `@https://mcp.apify.com/`
489
490## Resources
491
492- [docs.apify.com/llms.txt](https://docs.apify.com/llms.txt) - Quick reference
493- [docs.apify.com/llms-full.txt](https://docs.apify.com/llms-full.txt) - Complete docs
494- [crawlee.dev](https://crawlee.dev) - Crawlee documentation
495- [whitepaper.actor](https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/README.md) - Complete Actor specification

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:22 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser:myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser:myuser . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node:22

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser:myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder --chown=myuser:myuser /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser:myuser . ./

# Run the image.
CMD npm run start:prod --silent

eslint.config.mjs

1import prettier from 'eslint-config-prettier';
2
3import apify from '@apify/eslint-config/ts.js';
4import globals from 'globals';
5import tsEslint from 'typescript-eslint';
6
7// eslint-disable-next-line import/no-default-export
8export default [
9    { ignores: ['**/dist', 'eslint.config.mjs'] },
10    ...apify,
11    prettier,
12    {
13        languageOptions: {
14            parser: tsEslint.parser,
15            parserOptions: {
16                project: 'tsconfig.json',
17            },
18            globals: {
19                ...globals.node,
20                ...globals.jest,
21            },
22        },
23        plugins: {
24            '@typescript-eslint': tsEslint.plugin,
25        },
26        rules: {
27            'no-console': 0,
28        },
29    },
30];

package.json

{
	"name": "my-actor",
	"version": "0.0.1",
	"type": "module",
	"description": "This is an example of an Apify Actor.",
	"engines": {
		"node": ">=18.0.0"
	},
	"dependencies": {
		"apify": "^3.4.2",
		"axios": "^1.5.0",
		"cheerio": "^1.0.0-rc.12"
	},
	"devDependencies": {
		"@apify/eslint-config": "^1.0.0",
		"@apify/tsconfig": "^0.1.1",
		"@types/node": "^22.15.32",
		"eslint": "^9.29.0",
		"eslint-config-prettier": "^10.1.5",
		"globals": "^16.2.0",
		"prettier": "^3.5.3",
		"tsx": "^4.20.3",
		"typescript": "^5.8.3",
		"typescript-eslint": "^8.34.1"
	},
	"scripts": {
		"start": "npm run start:dev",
		"start:prod": "node dist/main.js",
		"start:dev": "tsx src/main.ts",
		"build": "tsc",
		"lint": "eslint",
		"lint:fix": "eslint --fix",
		"format": "prettier --write .",
		"format:check": "prettier --check .",
		"test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": ["./src/**/*"]
}

Fast Website Content Crawler

6sigmag/fast-website-content-crawler

A high-performance web scraper that rapidly extracts and analyzes content from multiple websites simultaneously. Perfect for competitive research, content aggregation, and website structure analysis.

David Deng

2.3K

4.3

(7)

Fast Google Search Results Scraper

6sigmag/fast-google-search-results-scraper

Paste keywords in bulk → get clean, clickable URLs. This ultra-lightweight Google SERP scraper is built for non-technical teams who need links fast for lead prospecting and market research. No giant payloads, no complex setup

David Deng

5.0

(1)

Fast URL Content Crawler

6sigmag/fast-url-content-crawler

A high-performance web scraper that rapidly extracts and analyzes content from multiple URLs simultaneously. Perfect for competitive research, content aggregation, and website structure analysis.

David Deng

175

5.0

(1)

Video Download Link Crawler(fast and cheap)

thenetaji/video-download-link-crawler

Extract direct video download links from any webpage! Customize crawling with regex, export in multiple formats, and automate video collection. Fast, efficient, and cheap—try now!

The Netaji

5.0

(1)

Website Content Crawler Pro

datascoutapi/website-content-crawler-pro

Crawl websites and extract clean, structured content in Markdown, JSON, or plain text for AI models, LLMs, vector DBs, or RAG pipelines. Fast, reliable, and stealthy, with bulk processing, advanced metadata extraction, and seamless integration with LangChain, LlamaIndex, and AI workflows.

halam

website content crawler

akash9078/website-content-crawler

Powerful website content crawler tool to extract, analyze, and index web pages automatically. Streamline data collection with fast, accurate web scraping technology.

Akash Kumar Naik

Website Content to Markdown for LLM Training

easyapi/website-content-to-markdown-for-llm-training

🚀 Transform web content into clean, LLM-ready Markdown! 📘 Scrape multiple pages, extract main content, and convert to Markdown format. Perfect for AI researchers, data scientists, and LLM developers. Fast, efficient, and customizable. Supercharge your AI training data today! 🌐📝🧠

EasyApi

178

5.0

(2)

Youtube

canadesk/youtube

Collect channels, videos (with comments), shorts, streams, playlists, podcasts and posts from Youtube! Filters included. It's fast and costs little!

Canadesk Support

Youtube Bulk

canadesk/youtube-bulk

No limits and Bulk mode! Collect channels, videos (with comments), shorts, streams, playlists, podcasts and posts from Youtube! Supports a lot of filters. It's fast and costs little!

Canadesk Support

Fast News Scraper

timgreen/fast-news-scraper

Extract full article text and metadata from popular news sites like The New York Times, AP News, Reuters, CNBC, NPR, and Wired. Scrape thousands of articles in just a few minutes.

Tim Green

457

5.0

(2)

Website Content Crawler Fast

Website Content Crawler Fast

.actor/actor.json

.actor/dataset_schema.json

.actor/input_schema.json

.actor/output_schema.json

src/main.ts

.dockerignore

.editorconfig

.gitignore

.prettierignore

.prettierrc

AGENTS.md

Dockerfile

eslint.config.mjs

package.json

tsconfig.json

You might also like

Fast Website Content Crawler

Fast Google Search Results Scraper

Fast URL Content Crawler

Video Download Link Crawler(fast and cheap)

Website Content Crawler Pro

website content crawler

Website Content to Markdown for LLM Training

Youtube

Youtube Bulk

Fast News Scraper

.actor/actor.json

.actor/dataset_schema.json

.actor/input_schema.json

.actor/output_schema.json

src/main.ts

.dockerignore

.editorconfig

.gitignore

.prettierignore

.prettierrc

AGENTS.md

Dockerfile

eslint.config.mjs

package.json

tsconfig.json