You can access the Japanese Website Content Crawler for RAG programmatically from your own applications by using the Apify API. You can also choose your preferred language or client from the options below. To use the Apify API, you’ll need an Apify account and your API token, which you can find in the Integrations settings in Apify Console.

Python

JavaScript

CLI

OpenAPI

HTTP

MCP

{
  "openapi": "3.0.1",
  "info": {
    "title": "Japanese Website Content Crawler for RAG | 日本語サイトをMarkdown化",
    "version": "0.1",
    "x-build-id": "Wz7eM7zDGh8FullQ9"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/nezha~website-content-crawler-japan/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-nezha-website-content-crawler-japan",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/nezha~website-content-crawler-japan/runs": {
      "post": {
        "operationId": "runs-sync-nezha-website-content-crawler-japan",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/nezha~website-content-crawler-japan/run-sync": {
      "post": {
        "operationId": "run-sync-nezha-website-content-crawler-japan",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls"
        ],
        "properties": {
          "startUrls": {
            "title": "日本語サイト、ドキュメント、ヘルプセンターのURL",
            "type": "array",
            "description": "Markdown、テキスト、HTMLに変換したい日本語ページまたはセクションを貼り付けます。ドキュメントのトップ、ヘルプセンターのカテゴリ、ブログ一覧、製品サイトの一部などを指定できます。Target scope onlyがオンの場合、URLのパスもクロール範囲に使われます。",
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "maxPages": {
            "title": "抽出する最大ページ数",
            "minimum": 1,
            "type": "integer",
            "description": "Datasetに保存する最大ページ数です。最初は高速プレビュー用に3のまま実行し、出力を確認してから増やしてください。実行時間とコストを調整する一番重要な項目です。",
            "default": 3
          },
          "crawlMode": {
            "title": "ページ探索方法",
            "enum": [
              "auto",
              "website",
              "sitemap"
            ],
            "type": "string",
            "description": "Autoはまずsitemap URLを使って高速にページを探し、対象ページが見つからない場合は開始URLからリンクをたどります。セクション単位でクロールしたい場合はWebsite links、sitemapだけに限定したい場合はSitemap onlyを選んでください。",
            "default": "auto"
          },
          "sitemapUrls": {
            "title": "Sitemap URL",
            "type": "array",
            "description": "任意のsitemap.xml URLです。空のままにすると、各開始URLのドメインで/sitemap.xmlを試します。信頼できるsitemapを持つドキュメント、ヘルプセンター、ブログに便利です。",
            "items": {
              "type": "string"
            }
          },
          "maxDepth": {
            "title": "リンクの深さ",
            "minimum": 0,
            "type": "integer",
            "description": "Website linksモードで何階層までリンクをたどるかを指定します。0は貼り付けたURLだけ、1は直接リンクされたページまで、2以上はより広いセクションのクロールに使います。Sitemap onlyモードでは無視されます。",
            "default": 1
          },
          "sameDomainOnly": {
            "title": "対象範囲だけをクロール",
            "type": "boolean",
            "description": "開始URLと同じドメインおよびパス配下だけをクロールします。例: /docsから開始すると/docs配下だけを対象にします。別ドメインやサイト全体まで広げたい場合だけオフにしてください。",
            "default": true
          },
          "outputFormat": {
            "title": "メインコンテンツ形式",
            "enum": [
              "markdown",
              "text",
              "html"
            ],
            "type": "string",
            "description": "Datasetのcontentフィールドに入る形式を指定します。多くのRAGやベクトルDBにはMarkdown、軽量な検索やQAにはプレーンテキスト、独自解析には構造が残るHTMLが向いています。",
            "default": "markdown"
          },
          "saveCleanHtml": {
            "title": "クリーンHTMLを別レコードに保存",
            "type": "boolean",
            "description": "各ページの整形済みHTMLをKey-value storeに保存し、CLEAN_HTML_INDEXに一覧を出力します。高速プレビューではオフのままにし、下流処理でHTMLファイルが必要な場合にオンにしてください。",
            "default": false
          },
          "contentSelector": {
            "title": "本文エリアのCSSセレクタ",
            "type": "string",
            "description": "本文エリアを指定する任意のCSSセレクタです。例: main、article、.docs-content、#content。空のままにするとmain、article、[role=main]、bodyの順に自動検出します。"
          },
          "removeSelectors": {
            "title": "削除するCSSセレクタ",
            "type": "array",
            "description": "抽出前に削除する任意のCSSセレクタです。例: .sidebar、.cookie-banner、.newsletter、.toc、.ads。ナビゲーションや共通パーツが出力に混ざる場合に使います。",
            "items": {
              "type": "string"
            }
          },
          "minTextLength": {
            "title": "最小テキスト長",
            "minimum": 0,
            "type": "integer",
            "description": "抽出テキストがこの文字数未満のページをスキップします。最初の実行では0のままにしてください。空ページ、リダイレクト、一覧ページを除外したい場合に後から増やします。",
            "default": 0
          },
          "includeUrlGlobs": {
            "title": "含めるURLパターン",
            "type": "array",
            "description": "残したいURLを指定する任意のCrawlee globパターンです。例: **/docs/**、**/help/**。クロール範囲をさらに絞りたい場合だけ使います。",
            "items": {
              "type": "string"
            }
          },
          "excludeUrlGlobs": {
            "title": "除外するURLパターン",
            "type": "array",
            "description": "スキップしたいURLを指定する任意のCrawlee globパターンです。例: **/search/**、**/login/**、**?*utm_*、**/*.pdf。",
            "items": {
              "type": "string"
            }
          },
          "excludeFileExtensions": {
            "title": "追加でスキップするファイル拡張子",
            "type": "array",
            "description": "このActorはpdf、画像、動画、Officeファイル、アーカイブなど一般的な非HTMLファイルを自動でスキップします。対象サイトに独自のダウンロード形式がある場合だけ追加してください。",
            "items": {
              "type": "string"
            }
          },
          "waitForSelector": {
            "title": "待機するCSSセレクタ",
            "type": "string",
            "description": "抽出前に表示を待つ任意のセレクタです。例: main、.article-body。初期読み込み後に本文が表示されるJavaScriptサイトで使います。"
          },
          "navigationTimeoutSecs": {
            "title": "ページ読み込みタイムアウト",
            "minimum": 15,
            "type": "integer",
            "description": "ページ遷移と任意セレクタの待機に使う秒数です。JavaScriptが多く遅いサイトでは増やしてください。",
            "default": 25
          },
          "proxyConfiguration": {
            "title": "プロキシ設定",
            "type": "object",
            "description": "任意のApifyプロキシ設定です。公開ドキュメント、ヘルプセンター、ブログでは通常デフォルトの直接接続で十分です。"
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "number",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "number",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "number",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "number",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "number",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "number",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "number",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "number",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "number",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "number",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "number",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "number",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "number",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "number",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "number",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "number",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

Japanese Website Content Crawler for RAG | 日本語サイトをMarkdown化 OpenAPI definition

OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.

OpenAPI works well with AI agents and GPTs because it standardizes how these systems interact with APIs, enabling reliable integrations and efficient communication.

By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.

You can download the OpenAPI definitions for Japanese Website Content Crawler for RAG from the options below:

OpenAPI.json

If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.

You can also check out our other API clients:

Japanese Website Content Crawler for RAG API in Python

Japanese Website Content Crawler for RAG API in JavaScript

Japanese Website Content Crawler for RAG API through CLI

Japanese Website Content Crawler for RAG API

Website Crawler API — Markdown for RAG

tugelbay/website-content-crawler

Website crawler API for public pages and clean Markdown, text, or HTML output for RAG pipelines, AI agents, documentation indexing, and monitoring. Guide: https://konabayev.com/tools/website-content-crawler/?utm_source=apify_info&utm_medium=referral&utm_campaign=website-content-crawler

Tugelbay Konabayev

Website Content Crawler

rupom888/website-content-crawler

Syed Rupom

Website to RAG Markdown Crawler

knotted_tussock/rag-markdown-crawler

Crawl any website or docs site and export clean Markdown plus JSONL-style chunks for RAG, LLM apps, and AI agents.

Ralph T

Docs Markdown Rag Ready Crawler

devwithbobby/docs-markdown-rag-ready-crawler

Turn any documentation site or website into clean, structured markdown—ready for RAG, embeddings, and AI agents.

Dev with Bobby

RAG Web Browser

travelmonitorlab/rag-web-browser

Search the web and extract content for AI/RAG pipelines. Returns clean text ready for LLM ingestion.

Travel Monitor Lab

RAG Web Browser

scrapier/rag-web-browser

🌐 RAG Web Browser (rag-web-browser) is an intelligent tool for retrieving and generating answers from web sources with RAG. ⚡ Speed up research, get accurate citations, and streamline workflows for developers & analysts.

Scrapier

Website to Text & Markdown — AI / RAG Content Crawler

inexhaustible_glass/rag-website-crawler

Scrape any website into clean text & Markdown with RAG-ready chunks and token counts for LLMs, vector databases (Pinecone, Qdrant) and AI chatbots. Also extracts linked PDF/Word/Excel. Anti-block, robots.txt-aware. Website-to-text for beginners, full RAG pipeline for pros. CPU only, no API key.