# Website Content Crawler Fast

Scraping data from every single web page.

Pricing: $4.00 / 1,000 results
{ "actorSpecification": 1, "name": "my-actor", "title": "Scrape single page in TypeScript", "description": "Scrape data from single page with provided URL.", "version": "0.0", "meta": { "templateId": "ts-start", "model": "<FILL-IN-MODEL>" }, "input": "./input_schema.json", "dockerfile": "../Dockerfile", "storages": { "dataset": "./dataset_schema.json" }}{ "actorSpecification": 1, "fields": {}, "views": { "overview": { "title": "Overview", "transformation": { "fields": [ "level", "text" ] }, "display": { "component": "table", "properties": { "level": { "label": "Level", "format": "text" }, "text": { "label": "Text", "format": "text" } } } } }}{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "url": { "title": "URL of the page", "type": "string", "description": "The URL of website you want to get the data from.", "editor": "textfield", "prefill": "https://www.apify.com" } }, "required": ["url"]}{ "actorOutputSchemaVersion": 1, "title": "Output schema of the files scraper", "description": "This is the output schema for the files scraper", "properties": { "level": { "type": "string", "title": "Level", "template": "{{links.apiDefaultDatasetUrl}}/items?view=level" }, "title": { "type": "string", "title": "Title", "template": "{{links.apiDefaultDatasetUrl}}/items?view=title" } }}1// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).2import { Actor } from 'apify';3// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).4import axios from 'axios';5// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).6import * as cheerio from 'cheerio';7
8// this is ESM project, and as such, it requires you to specify extensions in your relative imports9// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions10// note that we need to use `.js` even when inside TS files11// import { router } from './routes.js';12
13// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().14await Actor.init();15
16interface Input {17 url: string;18}19// Structure of input is defined in input_schema.json20const input = await Actor.getInput<Input>();21if (!input) throw new Error('Input is missing!');22const { url } = input;23
24// Fetch the HTML content of the page.25const response = await axios.get(url);26
27// Parse the downloaded HTML with Cheerio to enable data extraction.28const $ = cheerio.load(response.data);29
30// Extract all headings from the page (tag name and text).31const headings: { level: string; text: string }[] = [];32$('h1, h2, h3, h4, h5, h6').each((_i, element) => {33 const headingObject = {34 level: $(element).prop('tagName')!.toLowerCase(),35 text: $(element).text(),36 };37 console.log('Extracted heading', headingObject);38 headings.push(headingObject);39});40
41// Save headings to Dataset - a table-like storage.42await Actor.pushData(headings);43
44// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().45await Actor.exit();# configurations.idea.vscode.zed
# crawlee and apify storage foldersapify_storagecrawlee_storagestorage
# installed filesnode_modules
# git folder.git
# dist folderdistroot = true
[*]indent_style = spaceindent_size = 4charset = utf-8trim_trailing_whitespace = trueinsert_final_newline = trueend_of_line = lf# This file tells Git which files shouldn't be added to source control
.idea.vscode.zedstorageapify_storagecrawlee_storagenode_modulesdisttsconfig.tsbuildinfostorage/*!storage/key_value_storesstorage/key_value_stores/*!storage/key_value_stores/defaultstorage/key_value_stores/default/*!storage/key_value_stores/default/INPUT.json
# Added by Apify CLI.venv.prettierignore{ "printWidth": 120, "singleQuote": true, "tabWidth": 4}1# Apify Actors Development Guide2
Important: Before you begin, fill in the `model` property in the `meta` section of `.actor/actor.json`. Replace it with the model you're currently using. This helps Apify monitor and improve AGENTS.md for specific LLM models.

## What are Apify Actors?

- Actors are serverless programs that run in the cloud. They're inspired by the UNIX philosophy - programs that do one thing well and can be easily combined to build complex systems.
- Actors are programs packaged as Docker images that run in isolated containers

## Core Concepts
- Accept well-defined JSON input
- Perform isolated tasks (web scraping, automation, data processing)
- Produce structured JSON output to datasets and/or store data in key-value stores
- Can run from seconds to hours or even indefinitely
- Persist state and can be restarted (see the sketch after this list)
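
To make the last point concrete, here is a minimal sketch of state persistence with the Apify SDK; the `STATE` key and the `processed` counter are illustrative, not part of this template:

```typescript
import { Actor } from 'apify';

await Actor.init();

// Restore previously persisted state, or start fresh on the first run.
// The key name 'STATE' and the object shape are arbitrary examples.
const state = (await Actor.getValue<{ processed: number }>('STATE')) ?? { processed: 0 };

// Persist state when the platform is about to migrate the run to another server,
// so a restarted run can pick up where it left off.
Actor.on('migrating', async () => {
    await Actor.setValue('STATE', state);
});

// ... perform the Actor's work here, updating state.processed as items are handled ...

await Actor.setValue('STATE', state);
await Actor.exit();
```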
## Do

- accept well-defined JSON input and produce structured JSON output
- use Apify SDK (`apify`) for code running ON Apify platform
- validate input early with proper error handling and fail gracefully
- use CheerioCrawler for static HTML content (10x faster than browsers)
- use PlaywrightCrawler only for JavaScript-heavy sites and dynamic content
- use router pattern (createCheerioRouter/createPlaywrightRouter) for complex crawls (see the sketch after this list)
- implement retry strategies with exponential backoff for failed requests
- use proper concurrency settings (HTTP: 10-50, Browser: 1-5)
- set sensible defaults in `.actor/input_schema.json` for all optional fields
- set up output schema in `.actor/output_schema.json`
- clean and validate data before pushing to dataset
- use semantic CSS selectors and fallback strategies for missing elements
- respect robots.txt, ToS, and implement rate limiting with delays
- check which tools (cheerio/playwright/crawlee) are installed before applying guidance
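
A rough sketch of the router pattern and concurrency settings mentioned above, assuming `crawlee` is installed in the project (it is not a dependency of this template); the selectors, label, and URL are placeholders:

```typescript
import { CheerioCrawler, createCheerioRouter } from 'crawlee';

const router = createCheerioRouter();

// Default handler: listing pages - enqueue links to detail pages.
router.addDefaultHandler(async ({ enqueueLinks }) => {
    await enqueueLinks({ selector: 'a.product-link', label: 'DETAIL' });
});

// Labeled handler: detail pages - extract one item and store it.
router.addHandler('DETAIL', async ({ request, $, pushData }) => {
    await pushData({
        url: request.url,
        title: $('h1').first().text().trim(),
    });
});

const crawler = new CheerioCrawler({
    requestHandler: router,
    maxConcurrency: 20, // HTTP crawling tolerates higher concurrency than browsers
    maxRequestRetries: 3, // failed requests are retried before being marked as failed
});

await crawler.run(['https://example.com/category']);
```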
## Don't

- do not rely on `Dataset.getInfo()` for final counts on Cloud platform
- do not use browser crawlers when HTTP/Cheerio works (massive performance gains with HTTP)
- do not hard code values that should be in input schema or environment variables
- do not skip input validation or error handling
- do not overload servers - use appropriate concurrency and delays
- do not scrape prohibited content or ignore Terms of Service
- do not store personal/sensitive data unless explicitly permitted
- do not use deprecated options like `requestHandlerTimeoutMillis` on CheerioCrawler (v3.x)
- do not use `additionalHttpHeaders` - use `preNavigationHooks` instead (see the sketch after this list)
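
For the last point, a minimal sketch of setting request headers in a `preNavigationHooks` hook on `CheerioCrawler` (again assuming `crawlee` is installed; the header value is just an example):

```typescript
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Adjust the outgoing HTTP request in a hook instead of using
    // a deprecated `additionalHttpHeaders`-style option.
    preNavigationHooks: [
        async (_crawlingContext, gotOptions) => {
            gotOptions.headers = {
                ...gotOptions.headers,
                'Accept-Language': 'en-US,en;q=0.9',
            };
        },
    ],
    requestHandler: async ({ request, $ }) => {
        console.log(`${request.url}: ${$('title').text()}`);
    },
});

await crawler.run(['https://example.com']);
```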
## Commands

```bash
# Local development
apify run    # Run Actor locally

# Authentication & deployment
apify login  # Authenticate account
apify push   # Deploy to Apify platform

# Help
apify help   # List all commands
```

## Safety and Permissions

Allowed without prompt:

- read files with `Actor.getValue()`
- push data with `Actor.pushData()`
- set values with `Actor.setValue()`
- enqueue requests to RequestQueue
- run locally with `apify run`
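
A short sketch of the operations listed above, using only Apify SDK calls (key names and the URL are placeholders):

```typescript
import { Actor } from 'apify';

await Actor.init();

// Read a record from the default key-value store.
const config = await Actor.getValue('CONFIG');

// Write a record to the default key-value store.
await Actor.setValue('report.html', '<h1>Report</h1>', { contentType: 'text/html' });

// Push an item to the default dataset.
await Actor.pushData({ status: 'ok', hasConfig: config !== null });

// Enqueue a request to the default request queue.
const requestQueue = await Actor.openRequestQueue();
await requestQueue.addRequest({ url: 'https://example.com' });

await Actor.exit();
```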
Ask first:

- npm/pip package installations
- apify push (deployment to cloud)
- proxy configuration changes (requires paid plan)
- Dockerfile changes affecting builds
- deleting datasets or key-value stores

## Project Structure

```text
.actor/
├── actor.json            # Actor config: name, version, env vars, runtime settings
├── input_schema.json     # Input validation & Console form definition
└── output_schema.json    # Specifies where an Actor stores its output
src/
└── main.js               # Actor entry point and orchestrator
storage/                  # Local storage (mirrors Cloud during development)
├── datasets/             # Output items (JSON objects)
├── key_value_stores/     # Files, config, INPUT
└── request_queues/       # Pending crawl requests
Dockerfile                # Container image definition
AGENTS.md                 # AI agent instructions (this file)
```

## Actor Input Schema

The input schema defines the input parameters for an Actor. It's a JSON object comprising various field types supported by the Apify platform.

### Structure

```json
{
    "title": "<INPUT-SCHEMA-TITLE>",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        /* define input fields here */
    },
    "required": []
}
```

### Example

```json
{
    "title": "E-commerce Product Scraper Input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start scraping from (category pages or product pages)",
            "editor": "requestListSources",
            "default": [{ "url": "https://example.com/category" }],
            "prefill": [{ "url": "https://example.com/category" }]
        },
        "followVariants": {
            "title": "Follow Product Variants",
            "type": "boolean",
            "description": "Whether to scrape product variants (different colors, sizes)",
            "default": true
        },
        "maxRequestsPerCrawl": {
            "title": "Max Requests per Crawl",
            "type": "integer",
            "description": "Maximum number of pages to scrape (0 = unlimited)",
            "default": 1000,
            "minimum": 0
        },
        "proxyConfiguration": {
            "title": "Proxy Configuration",
            "type": "object",
            "description": "Proxy settings for anti-bot protection",
            "editor": "proxy",
            "default": { "useApifyProxy": false }
        },
        "locale": {
            "title": "Locale",
            "type": "string",
            "description": "Language/country code for localized content",
            "default": "cs",
            "enum": ["cs", "en", "de", "sk"],
            "enumTitles": ["Czech", "English", "German", "Slovak"]
        }
    },
    "required": ["startUrls"]
}
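
For illustration, an Actor could read and validate input matching the example schema above roughly like this; the `EcommerceInput` interface is hypothetical and simply mirrors the schema, it is not part of this template:

```typescript
import { Actor } from 'apify';

// Hypothetical shape mirroring the example input schema above.
interface EcommerceInput {
    startUrls: { url: string }[];
    followVariants?: boolean;
    maxRequestsPerCrawl?: number;
    proxyConfiguration?: { useApifyProxy: boolean };
    locale?: 'cs' | 'en' | 'de' | 'sk';
}

await Actor.init();

const input = await Actor.getInput<EcommerceInput>();
if (!input?.startUrls?.length) {
    throw new Error('At least one start URL is required.');
}

// The platform applies schema defaults; guard locally as well for local runs.
const { followVariants = true, maxRequestsPerCrawl = 1000, locale = 'cs' } = input;
console.log('Running with', { followVariants, maxRequestsPerCrawl, locale });

await Actor.exit();
```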
## Actor Output Schema

The Actor output schema builds upon the schemas for the dataset and key-value store. It specifies where an Actor stores its output and defines templates for accessing that output. Apify Console uses these output definitions to display run results.

### Structure

```json
{
    "actorOutputSchemaVersion": 1,
    "title": "<OUTPUT-SCHEMA-TITLE>",
    "properties": {
        /* define your outputs here */
    }
}
```

### Example

```json
{
    "actorOutputSchemaVersion": 1,
    "title": "Output schema of the files scraper",
    "properties": {
        "files": {
            "type": "string",
            "title": "Files",
            "template": "{{links.apiDefaultKeyValueStoreUrl}}/keys"
        },
        "dataset": {
            "type": "string",
            "title": "Dataset",
            "template": "{{links.apiDefaultDatasetUrl}}/items"
        }
    }
}
```

### Output Schema Template Variables

- `links` (object) - Contains quick links to most commonly used URLs
- `links.publicRunUrl` (string) - Public run URL in format `https://console.apify.com/view/runs/:runId`
- `links.consoleRunUrl` (string) - Console run URL in format `https://console.apify.com/actors/runs/:runId`
- `links.apiRunUrl` (string) - API run URL in format `https://api.apify.com/v2/actor-runs/:runId`
- `links.apiDefaultDatasetUrl` (string) - API URL of default dataset in format `https://api.apify.com/v2/datasets/:defaultDatasetId`
- `links.apiDefaultKeyValueStoreUrl` (string) - API URL of default key-value store in format `https://api.apify.com/v2/key-value-stores/:defaultKeyValueStoreId`
- `links.containerRunUrl` (string) - URL of a webserver running inside the run in format `https://<containerId>.runs.apify.net/`
- `run` (object) - Contains information about the run, the same as returned from the `GET Run` API endpoint
- `run.defaultDatasetId` (string) - ID of the default dataset
- `run.defaultKeyValueStoreId` (string) - ID of the default key-value store
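
As a sketch of what these templates resolve to at run time, the default dataset URL can be reconstructed inside a running Actor from its environment (this uses `axios`, which the template already depends on; the token query parameter is only needed for private datasets):

```typescript
import { Actor } from 'apify';
import axios from 'axios';

await Actor.init();

// `{{links.apiDefaultDatasetUrl}}` resolves to this URL for the current run.
const { defaultDatasetId } = Actor.getEnv();
const datasetItemsUrl = `https://api.apify.com/v2/datasets/${defaultDatasetId}/items`;

// Fetch the items the same way a consumer following the output schema link would.
const { data: items } = await axios.get(datasetItemsUrl, {
    params: { token: process.env.APIFY_TOKEN }, // set automatically on the platform
});
console.log(`Default dataset currently holds ${items.length} items`);

await Actor.exit();
```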
## Dataset Schema Specification

The dataset schema defines how your Actor's output data is structured, transformed, and displayed in the Output tab in the Apify Console.

### Example

Consider an example Actor that calls `Actor.pushData()` to store data into a dataset:
```typescript
import { Actor } from 'apify';
// Initialize the JavaScript SDK
await Actor.init();

/**
 * Actor code
 */
await Actor.pushData({
    numericField: 10,
    pictureUrl: 'https://www.google.com/images/branding/googlelogo/2x/googlelogo_color_92x30dp.png',
    linkUrl: 'https://google.com',
    textField: 'Google',
    booleanField: true,
    dateField: new Date(),
    arrayField: ['#hello', '#world'],
    objectField: {},
});

// Exit successfully
await Actor.exit();
```
To set up the Actor's output tab UI, reference a dataset schema file in `.actor/actor.json`:

```json
{
    "actorSpecification": 1,
    "name": "book-library-scraper",
    "title": "Book Library Scraper",
    "version": "1.0.0",
    "storages": {
        "dataset": "./dataset_schema.json"
    }
}
```

Then create the dataset schema in `.actor/dataset_schema.json`:

```json
{
    "actorSpecification": 1,
    "fields": {},
    "views": {
        "overview": {
            "title": "Overview",
            "transformation": {
                "fields": [
                    "pictureUrl",
                    "linkUrl",
                    "textField",
                    "booleanField",
                    "arrayField",
                    "objectField",
                    "dateField",
                    "numericField"
                ]
            },
            "display": {
                "component": "table",
                "properties": {
                    "pictureUrl": { "label": "Image", "format": "image" },
                    "linkUrl": { "label": "Link", "format": "link" },
                    "textField": { "label": "Text", "format": "text" },
                    "booleanField": { "label": "Boolean", "format": "boolean" },
                    "arrayField": { "label": "Array", "format": "array" },
                    "objectField": { "label": "Object", "format": "object" },
                    "dateField": { "label": "Date", "format": "date" },
                    "numericField": { "label": "Number", "format": "number" }
                }
            }
        }
    }
}
```
### Structure

```json
{
    "actorSpecification": 1,
    "fields": {},
    "views": {
        "<VIEW_NAME>": {
            "title": "string (required)",
            "description": "string (optional)",
            "transformation": {
                "fields": ["string (required)"],
                "unwind": ["string (optional)"],
                "flatten": ["string (optional)"],
                "omit": ["string (optional)"],
                "limit": "integer (optional)",
                "desc": "boolean (optional)"
            },
            "display": {
                "component": "table (required)",
                "properties": {
                    "<FIELD_NAME>": {
                        "label": "string (optional)",
                        "format": "text|number|date|link|boolean|image|array|object (optional)"
                    }
                }
            }
        }
    }
}
```
**Dataset Schema Properties:**

- `actorSpecification` (integer, required) - Specifies the version of dataset schema structure document (currently only version 1)
- `fields` (JSONSchema object, required) - Schema of one dataset object (use JSON Schema Draft 2020-12 or compatible)
- `views` (DatasetView object, required) - Object with API and UI views description

**DatasetView Properties:**

- `title` (string, required) - Visible in UI Output tab and API
- `description` (string, optional) - Only available in API response
- `transformation` (ViewTransformation object, required) - Data transformation applied when loading from Dataset API
- `display` (ViewDisplay object, required) - Output tab UI visualization definition

**ViewTransformation Properties:**

- `fields` (string[], required) - Fields to present in output (order matches column order)
- `unwind` (string[], optional) - Deconstructs nested children into parent object
- `flatten` (string[], optional) - Transforms nested object into flat structure
- `omit` (string[], optional) - Removes specified fields from output
- `limit` (integer, optional) - Maximum number of results (default: all)
- `desc` (boolean, optional) - Sort order (true = newest first)

**ViewDisplay Properties:**

- `component` (string, required) - Only `table` is available
- `properties` (Object, optional) - Keys matching `transformation.fields` with ViewDisplayProperty values

**ViewDisplayProperty Properties:**

- `label` (string, optional) - Table column header
- `format` (string, optional) - One of: `text`, `number`, `date`, `link`, `boolean`, `image`, `array`, `object`
## Key-Value Store Schema Specification

The key-value store schema organizes keys into logical groups called collections for easier data management.

### Example

Consider an example Actor that calls `Actor.setValue()` to save records into the key-value store:
```typescript
import { Actor } from 'apify';
// Initialize the JavaScript SDK
await Actor.init();

/**
 * Actor code
 */
await Actor.setValue('document-1', 'my text data', { contentType: 'text/plain' });

// `imageID` and `imageBuffer` are placeholders for values produced by the Actor's own logic.
await Actor.setValue(`image-${imageID}`, imageBuffer, { contentType: 'image/jpeg' });

// Exit successfully
await Actor.exit();
```
To configure the key-value store schema, reference a schema file in `.actor/actor.json`:

```json
{
    "actorSpecification": 1,
    "name": "data-collector",
    "title": "Data Collector",
    "version": "1.0.0",
    "storages": {
        "keyValueStore": "./key_value_store_schema.json"
    }
}
```

Then create the key-value store schema in `.actor/key_value_store_schema.json`:

```json
{
    "actorKeyValueStoreSchemaVersion": 1,
    "title": "Key-Value Store Schema",
    "collections": {
        "documents": {
            "title": "Documents",
            "description": "Text documents stored by the Actor",
            "keyPrefix": "document-"
        },
        "images": {
            "title": "Images",
            "description": "Images stored by the Actor",
            "keyPrefix": "image-",
            "contentTypes": ["image/jpeg"]
        }
    }
}
```
### Structure

```json
{
    "actorKeyValueStoreSchemaVersion": 1,
    "title": "string (required)",
    "description": "string (optional)",
    "collections": {
        "<COLLECTION_NAME>": {
            "title": "string (required)",
            "description": "string (optional)",
            "key": "string (conditional - use key OR keyPrefix)",
            "keyPrefix": "string (conditional - use key OR keyPrefix)",
            "contentTypes": ["string (optional)"],
            "jsonSchema": "object (optional)"
        }
    }
}
```

**Key-Value Store Schema Properties:**

- `actorKeyValueStoreSchemaVersion` (integer, required) - Version of key-value store schema structure document (currently only version 1)
- `title` (string, required) - Title of the schema
- `description` (string, optional) - Description of the schema
- `collections` (Object, required) - Object where each key is a collection ID and value is a Collection object

**Collection Properties:**

- `title` (string, required) - Collection title shown in UI tabs
- `description` (string, optional) - Description appearing in UI tooltips
- `key` (string, conditional) - Single specific key for this collection
- `keyPrefix` (string, conditional) - Prefix for keys included in this collection
- `contentTypes` (string[], optional) - Allowed content types for validation
- `jsonSchema` (object, optional) - JSON Schema Draft 07 format for `application/json` content type validation

Either `key` or `keyPrefix` must be specified for each collection, but not both.
## Apify MCP Tools

If an MCP server is configured, use these tools for documentation:

- `search-apify-docs` - Search documentation
- `fetch-apify-docs` - Get full doc pages

Otherwise, reference: `@https://mcp.apify.com/`

## Resources
- [docs.apify.com/llms.txt](https://docs.apify.com/llms.txt) - Quick reference
- [docs.apify.com/llms-full.txt](https://docs.apify.com/llms-full.txt) - Complete docs
- [crawlee.dev](https://crawlee.dev) - Crawlee documentation
- [whitepaper.actor](https://raw.githubusercontent.com/apify/actor-whitepaper/refs/heads/master/README.md) - Complete Actor specification

**Dockerfile**

```dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:22 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project.
RUN npm run build

# Create final image
FROM apify/actor-node:22

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from the builder image
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent
```

**eslint.config.mjs**

```javascript
import prettier from 'eslint-config-prettier';

import apify from '@apify/eslint-config/ts.js';
import globals from 'globals';
import tsEslint from 'typescript-eslint';

// eslint-disable-next-line import/no-default-export
export default [
    { ignores: ['**/dist', 'eslint.config.mjs'] },
    ...apify,
    prettier,
    {
        languageOptions: {
            parser: tsEslint.parser,
            parserOptions: {
                project: 'tsconfig.json',
            },
            globals: {
                ...globals.node,
                ...globals.jest,
            },
        },
        plugins: {
            '@typescript-eslint': tsEslint.plugin,
        },
        rules: {
            'no-console': 0,
        },
    },
];
```

**package.json**

```json
{
    "name": "my-actor",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify Actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.4.2",
        "axios": "^1.5.0",
        "cheerio": "^1.0.0-rc.12"
    },
    "devDependencies": {
        "@apify/eslint-config": "^1.0.0",
        "@apify/tsconfig": "^0.1.1",
        "@types/node": "^22.15.32",
        "eslint": "^9.29.0",
        "eslint-config-prettier": "^10.1.5",
        "globals": "^16.2.0",
        "prettier": "^3.5.3",
        "tsx": "^4.20.3",
        "typescript": "^5.8.3",
        "typescript-eslint": "^8.34.1"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "tsc",
        "lint": "eslint",
        "lint:fix": "eslint --fix",
        "format": "prettier --write .",
        "format:check": "prettier --check .",
        "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
```

**tsconfig.json**

```json
{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": ["./src/**/*"]
}
```