1import unittest
2from types import ModuleType
3from unittest.mock import patch
4
5from pydantic import ValidationError
6
7from src.models import MarkdownSplitterInput
8from src.processing import convert_to_markdown, format_chunks, resolve_markdown_text
9
10
class MarkdownSplitterInputTests(unittest.TestCase):
    """Validation checks for constructing ``MarkdownSplitterInput``."""

    def test_text_mode_requires_markdown_text(self) -> None:
        """Expect a ``ValidationError`` for text mode without markdown text."""
        payload = {"input_mode": "text", "markdown_text": None}
        with self.assertRaises(ValidationError):
            MarkdownSplitterInput(**payload)

    def test_file_mode_requires_markdown_file(self) -> None:
        """Expect a ``ValidationError`` for file mode without a markdown file."""
        payload = {"input_mode": "file", "markdown_file": None}
        with self.assertRaises(ValidationError):
            MarkdownSplitterInput(**payload)

    def test_markdown_text_respects_input_limit(self) -> None:
        """Expect a ``ValidationError`` when the text is one char over the limit."""
        limit = 10
        oversized_text = "x" * (limit + 1)
        with self.assertRaises(ValidationError):
            MarkdownSplitterInput(
                markdown_text=oversized_text,
                max_input_chars=limit,
            )

    def test_max_chunk_chars_accepts_positive_values(self) -> None:
        """A positive ``max_chunk_chars`` is expected to be stored as given."""
        requested_size = 1800
        model = MarkdownSplitterInput(
            markdown_text="# Heading",
            max_chunk_chars=requested_size,
        )
        self.assertEqual(model.max_chunk_chars, requested_size)
27
28
class RuntimeHelpersTests(unittest.IsolatedAsyncioTestCase):
    """Tests for ``resolve_markdown_text``, ``convert_to_markdown`` and ``format_chunks``.

    Remote fetching, key-value-store loading and conversion are replaced by
    in-memory stubs, so nothing here touches the network or real storage.
    """

    @staticmethod
    def _file_mode_input(location: str) -> MarkdownSplitterInput:
        """Build file-mode parameters that point at ``location``."""
        return MarkdownSplitterInput(
            input_mode="file",
            markdown_text=None,
            markdown_file=location,
        )

    @staticmethod
    def _remote_stub(payload: bytes, content_type: str):
        """Return a ``fetch_remote`` stand-in yielding ``(payload, content_type)``."""
        return lambda _url: (payload, content_type)

    @staticmethod
    def _kvs_stub(_key):
        """Stand-in for ``load_kvs`` that always returns the same markdown."""
        return "# from kvs"

    async def test_resolve_markdown_text_from_text_mode(self) -> None:
        """Inline ``markdown_text`` is expected back with a ``text/markdown`` type."""
        splitter_input = MarkdownSplitterInput(markdown_text="# Heading")

        text, file_type = await resolve_markdown_text(
            params=splitter_input,
            fetch_remote=self._remote_stub(b"# from remote", "text/markdown"),
            load_kvs=self._kvs_stub,
        )

        self.assertEqual(text, "# Heading")
        self.assertEqual(file_type, "text/markdown")

    async def test_resolve_markdown_text_from_remote_file(self) -> None:
        """An https file is expected to yield the converter output and fetched type."""
        splitter_input = self._file_mode_input("https://example.com/input.md")
        html_payload = b"<html><body><h1>Fetched Markdown</h1></body></html>"

        text, file_type = await resolve_markdown_text(
            params=splitter_input,
            fetch_remote=self._remote_stub(html_payload, "text/html"),
            load_kvs=self._kvs_stub,
            convert_remote_file=lambda _raw, _content_type: "# fetched markdown",
        )

        self.assertEqual(text, "# fetched markdown")
        self.assertEqual(file_type, "text/html")

    async def test_resolve_markdown_text_from_kvs_file(self) -> None:
        """A ``kvs://`` file is expected to yield the stubbed store content."""
        splitter_input = self._file_mode_input("kvs://MARKDOWN_INPUT")

        text, file_type = await resolve_markdown_text(
            params=splitter_input,
            fetch_remote=self._remote_stub(b"# remote", "text/markdown"),
            load_kvs=self._kvs_stub,
        )

        self.assertEqual(text, "# from kvs")
        self.assertEqual(file_type, "text/markdown")

    async def test_resolve_markdown_text_rejects_unsupported_scheme(self) -> None:
        """Expect a ``ValueError`` when the file location uses ``ftp://``."""
        # Build the parameters outside the assertion so that only the
        # resolve call itself is checked for the ValueError.
        splitter_input = self._file_mode_input("ftp://example.com/input.md")

        with self.assertRaises(ValueError):
            await resolve_markdown_text(
                params=splitter_input,
                fetch_remote=self._remote_stub(b"# remote", "text/markdown"),
                load_kvs=self._kvs_stub,
            )

    def test_convert_to_markdown_uses_markitdown_converter(self) -> None:
        """With a fake ``markitdown`` module installed, its text is expected back."""
        expected_markdown = "# Title\n\nBody"

        class _Result:
            text_content = expected_markdown

        class _Converter:
            def convert_stream(self, _stream, mimetype=None):
                self.mimetype = mimetype
                return _Result()

        def _make_converter(enable_plugins=False):
            return _Converter()

        stub_module = ModuleType("markitdown")
        stub_module.MarkItDown = _make_converter

        # Install the fake module only for the duration of the call; patch.dict
        # restores sys.modules when the block exits.
        with patch.dict("sys.modules", markitdown=stub_module):
            converted = convert_to_markdown(b"<h1>Title</h1><p>Body</p>", "text/html")

        self.assertEqual(converted, expected_markdown)

    def test_format_chunks_adds_chunk_metrics(self) -> None:
        """The first formatted chunk is expected to carry counts and an id."""

        class _Doc:
            def __init__(self, page_content: str, metadata: dict) -> None:
                self.page_content = page_content
                self.metadata = metadata

        document = _Doc("abc", {"Header 1": "H1"})
        formatted = format_chunks([document])

        self.assertEqual(formatted[0].char_count, 3)
        self.assertEqual(formatted[0].token_count, 1)
        self.assertTrue(formatted[0].chunk_id)
120
121
# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()