
Markdown Header Text Splitter
Pricing
$20.00/month + usage
Go to Store

Markdown Header Text Splitter
Split Markdown into structured chunks using header hierarchy. Built with LangChain, it preserves metadata for RAG, documentation, and analysis. Configure headers, strip content, and integrate with vector databases. Ideal for AI workflows.
0.0 (0)
Pricing
$20.00/month + usage
0
Total users
2
Monthly users
2
Runs succeeded
>99%
Last modified
a month ago
.dockerignore
.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
.gitignore
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
# Added by Apify CLInode_modules
requirements.txt
1annotated-types==0.7.02anyio==4.8.03apify==2.4.04apify_client==1.9.25apify_shared==1.3.16Brotli==1.1.07browserforge==1.2.38cachetools==5.5.29certifi==2025.1.3110cffi==1.17.111charset-normalizer==3.4.112click==8.1.813colorama==0.4.614crawlee==0.6.315cryptography==44.0.216docutils==0.21.217eval_type_backport==0.2.218filelock==3.17.019greenlet==3.1.120h11==0.14.021h2==4.2.022hpack==4.1.023httpcore==1.0.724httpx==0.28.125hyperframe==6.1.026idna==3.1027jsonpatch==1.3328jsonpointer==3.0.029langchain==0.3.2030langchain-core==0.3.4331langchain-text-splitters==0.3.632langsmith==0.3.1333lazy-object-proxy==1.10.034markdown-it-py==3.0.035mdurl==0.1.236more-itertools==10.6.037multidict==6.1.038orjson==3.10.1539packaging==24.240propcache==0.3.041psutil==7.0.042pycparser==2.2243pydantic==2.10.644pydantic-settings==2.6.145pydantic_core==2.27.246pyee==12.1.147Pygments==2.19.148python-dotenv==1.0.149PyYAML==6.0.250requests==2.32.351requests-file==2.1.052requests-toolbelt==1.0.053rich==13.9.454setuptools==76.0.055sniffio==1.3.156sortedcollections==2.1.057sortedcontainers==2.4.058SQLAlchemy==2.0.3859tenacity==9.0.060tldextract==5.1.361typing_extensions==4.12.262urllib3==2.3.063websockets==15.0.164wheel==0.45.165yarl==1.18.366zstandard==0.23.0
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Create and run as a non-root user.
RUN useradd --create-home apify && \
    chown -R apify:apify ./ && \
    chown -R apify:apify /usr/local/lib/python*
USER apify

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m ." command is run
CMD ["python3", "-m", "src"]
.actor/INPUT_SCHEMA.json
{ "title": "Markdown Splitter", "description": "Splits Markdown text into chunks based on headers", "type": "object", "schemaVersion": 1, "properties": { "markdown_text": { "title": "Markdown Text", "type": "string", "description": "The Markdown content to process", "editor": "textarea", "default": "# Header 1\nHello, World!\n## Header 2\nGoodbye, World!" }, "headers_to_split_on": { "title": "Headers to Split On", "type": "array", "description": "Header levels (e.g., ['#', '##'])", "editor": "stringList", "default": ["#", "##", "###", "####", "#####", "######"] }, "strip_headers": { "title": "Strip Headers", "type": "boolean", "description": "Remove headers from chunk content", "default": true } }, "required": ["markdown_text"]}
.actor/actor.json
{ "actorSpecification": 1, "name": "markdown-splitter", "title": "Markdown Header Text Splitter", "description": "Splits Markdown documents into chunks based on header hierarchy", "version": "1.0", "buildTag": "latest", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "fields": { "type": "object", "properties": { "chunks": { "type": "array", "items": { "type": "object", "properties": { "content": { "type": "string" }, "metadata": { "type": "object" } } } }, "error": { "type": "string" } }, "required": ["chunks"] } } }}
src/__init__.py
1
src/__main__.py
"""Module entry point for ``python -m src``: runs the Actor's async main()."""

import asyncio

from .main import main

# Guard the entry point so importing this module never triggers a run;
# `python -m src` executes it with __name__ == "__main__", as before.
if __name__ == "__main__":
    asyncio.run(main())
src/main.py
from apify import Actor
# Canonical import path for the pinned `langchain-text-splitters` package;
# `langchain.text_splitter` is the deprecated legacy re-export.
from langchain_text_splitters import MarkdownHeaderTextSplitter

from .models import MarkdownSplitterInput, MarkdownSplitterOutput


async def main():
    """Actor entry point: split input Markdown by headers and push the chunks.

    Reads and validates the Actor input (``MarkdownSplitterInput``), splits
    the Markdown text with LangChain's ``MarkdownHeaderTextSplitter``, and
    pushes a ``MarkdownSplitterOutput`` record to the default dataset.
    On any error, the Actor is failed with a descriptive status message.
    """
    async with Actor:
        try:
            input_data = await Actor.get_input() or {}
            params = MarkdownSplitterInput(**input_data)

            # LangChain expects (header_prefix, metadata_key) pairs; derive
            # the metadata key from the header depth, e.g. "##" -> "Header 2".
            headers = [
                (h, f"Header {h.count('#')}") for h in params.headers_to_split_on
            ]

            splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=headers, strip_headers=params.strip_headers
            )
            documents = splitter.split_text(params.markdown_text)

            chunks = [
                {"content": doc.page_content, "metadata": doc.metadata}
                for doc in documents
            ]

            await Actor.push_data(
                MarkdownSplitterOutput(chunks=chunks).model_dump(exclude_none=True)
            )

        except Exception as e:  # top-level boundary: report failure to the platform
            error_msg = f"Actor failed: {str(e)}"
            # log.exception records the traceback alongside the message.
            Actor.log.exception(error_msg)
            await Actor.fail(exception=e, status_message=error_msg)
src/models.py
1from pydantic import BaseModel, Field, field_validator2from typing import List, Optional3
4
class MarkdownSplitterInput(BaseModel):
    """Validated input for the Markdown header splitter Actor."""

    # The Markdown document to split (required).
    markdown_text: str = Field(..., description="Markdown content to split")
    # ATX header prefixes that mark chunk boundaries.
    headers_to_split_on: List[str] = Field(
        default=["#", "##", "###", "####", "#####", "######"],
        description="Header levels to split on (e.g., ['#', '##'])",
    )
    # When True, the header line itself is removed from each chunk's content.
    strip_headers: bool = Field(True, description="Remove headers from chunks")

    @field_validator("headers_to_split_on")
    @classmethod
    def validate_headers(cls, v: List[str]) -> List[str]:
        """Ensure every entry is a non-empty run of '#' characters.

        Raises:
            ValueError: if an entry is empty or contains anything other
                than '#' characters (surrounding whitespace is ignored).
        """
        for h in v:
            stripped = h.strip()
            # A valid marker consists solely of '#' characters. The previous
            # check (replace('#', '').strip() == '') wrongly accepted markers
            # with interior whitespace such as "# #"; a character-set test
            # rejects those while still allowing "#", "##", etc.
            if not stripped or set(stripped) != {"#"}:
                raise ValueError(f"Invalid header format: {h}. Use '#', '##', etc.")
        return v
25
class MarkdownSplitterOutput(BaseModel):
    """Result record pushed to the Actor's dataset."""

    # Each chunk is a dict of the form {"content": str, "metadata": dict}.
    chunks: List[dict] = Field(..., description="Processed chunks with metadata")
    # Optional error description; omitted from output when None.
    error: Optional[str] = Field(default=None, description="Error details")
src/py.typed