Files
syndarix/mcp-servers/knowledge-base/tests/test_chunking.py
Felipe Cardoso 51404216ae refactor(knowledge-base mcp server): adjust formatting for consistency and readability
Improved code formatting, line breaks, and indentation across chunking logic and multiple test modules to enhance code clarity and maintain consistent style. No functional changes made.
2026-01-06 17:20:31 +01:00

422 lines
12 KiB
Python

"""Tests for chunking module."""
class TestBaseChunker:
    """Exercise the token helpers of a chunker, using ``TextChunker``."""

    @staticmethod
    def _make_text_chunker(settings):
        """Return a ``TextChunker`` with chunk_size=400 and chunk_overlap=50."""
        # Imported inside the function, like every other import in this file.
        from chunking.text import TextChunker

        return TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

    def test_count_tokens(self, settings):
        """A short greeting produces a small, positive token count."""
        token_count = self._make_text_chunker(settings).count_tokens("Hello, world!")

        # Roughly 3-4 tokens are expected; only loose bounds are asserted.
        assert token_count > 0
        assert token_count < 10

    def test_truncate_to_tokens(self, settings):
        """Truncated text never counts more tokens than the requested limit."""
        limit = 10
        chunker = self._make_text_chunker(settings)

        shortened = chunker.truncate_to_tokens("word " * 1000, limit)

        assert chunker.count_tokens(shortened) <= limit
class TestCodeChunker:
    """Tests for ``CodeChunker``."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Chunking the Python fixture yields CODE chunks tagged as PYTHON."""
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.CODE for c in chunks)
        assert all(c.file_type == FileType.PYTHON for c in chunks)

    def test_preserves_function_boundaries(self, settings):
        """Chunks of a two-function module carry consistent line ranges."""
        from chunking.code import CodeChunker
        from models import FileType

        # The sample has to be syntactically valid Python: function bodies
        # must be indented, otherwise the chunker is never given real
        # function definitions to split on.
        code = '''def function_one():
    """First function."""
    return 1

def function_two():
    """Second function."""
    return 2
'''

        chunker = CodeChunker(
            chunk_size=100,
            chunk_overlap=10,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=code,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Ideally each function lands in its own chunk, but the exact split
        # is not pinned down here; at least one chunk is required.
        assert len(chunks) >= 1

        for chunk in chunks:
            # Every chunk must report a well-ordered line range.
            assert chunk.start_line is not None
            assert chunk.end_line is not None
            assert chunk.start_line <= chunk.end_line

    def test_handles_empty_content(self, settings):
        """Empty content produces an empty list of chunks."""
        from chunking.code import CodeChunker

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(content="", source_path="/test/empty.py")

        assert chunks == []

    def test_chunk_type_is_code(self, settings):
        """The ``chunk_type`` attribute of the chunker is ``ChunkType.CODE``."""
        from chunking.code import CodeChunker
        from models import ChunkType

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        assert chunker.chunk_type == ChunkType.CODE
class TestMarkdownChunker:
    """Exercise ``MarkdownChunker`` on fixture and inline markdown."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """The markdown fixture is split into MARKDOWN-typed chunks."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        md_chunker = MarkdownChunker(
            settings=settings, chunk_size=800, chunk_overlap=100
        )

        produced = md_chunker.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(produced) > 0
        assert all(piece.chunk_type == ChunkType.MARKDOWN for piece in produced)

    def test_respects_heading_hierarchy(self, settings):
        """A document with nested headings yields non-empty chunks."""
        from chunking.markdown import MarkdownChunker

        document = (
            "# Main Title\n"
            "Introduction paragraph.\n"
            "## Section One\n"
            "Content for section one.\n"
            "### Subsection\n"
            "More detailed content.\n"
            "## Section Two\n"
            "Content for section two.\n"
        )
        md_chunker = MarkdownChunker(
            settings=settings, chunk_size=200, chunk_overlap=20
        )

        produced = md_chunker.chunk(content=document, source_path="/test/docs.md")

        # Several section-based chunks are expected, but only "at least one"
        # is enforced.
        assert len(produced) >= 1
        # Heading context in metadata is not inspected here; each chunk is
        # only required to carry some content.
        assert all(len(piece.content) > 0 for piece in produced)

    def test_handles_code_blocks(self, settings):
        """A fenced code block survives chunking somewhere in the output."""
        from chunking.markdown import MarkdownChunker

        document = (
            "# Code Example\n"
            "Here's some code:\n"
            "```python\n"
            "def hello():\n"
            'print("Hello, World!")\n'
            "```\n"
            "End of example.\n"
        )
        md_chunker = MarkdownChunker(
            settings=settings, chunk_size=500, chunk_overlap=50
        )

        produced = md_chunker.chunk(content=document, source_path="/test/code.md")

        assert len(produced) >= 1
        # Either the fence opener or the function header must still be
        # present once all chunk contents are joined.
        joined = " ".join(piece.content for piece in produced)
        assert any(marker in joined for marker in ("```python", "def hello"))

    def test_chunk_type_is_markdown(self, settings):
        """The ``chunk_type`` attribute of the chunker is ``ChunkType.MARKDOWN``."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        md_chunker = MarkdownChunker(
            settings=settings, chunk_size=800, chunk_overlap=100
        )

        assert md_chunker.chunk_type == ChunkType.MARKDOWN
class TestTextChunker:
    """Tests for ``TextChunker``."""

    def test_chunk_text(self, settings, sample_text):
        """Chunking the plain-text fixture yields TEXT-typed chunks."""
        from chunking.text import TextChunker
        from models import ChunkType

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=sample_text,
            source_path="/test/docs.txt",
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.TEXT for c in chunks)

    def test_respects_paragraph_boundaries(self, settings):
        """Text made of several paragraphs is chunked into at least one chunk."""
        from chunking.text import TextChunker

        # Paragraphs are separated by blank lines so the input really
        # contains paragraph boundaries for the chunker to work with.
        text = """First paragraph with some content.

Second paragraph with different content.

Third paragraph to test chunking behavior.
"""

        chunker = TextChunker(
            chunk_size=100,
            chunk_overlap=10,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=text,
            source_path="/test/text.txt",
        )

        assert len(chunks) >= 1

    def test_handles_single_paragraph(self, settings):
        """A short paragraph comes back as exactly one chunk, unchanged."""
        from chunking.text import TextChunker

        text = "This is a short paragraph."

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(content=text, source_path="/test/short.txt")

        assert len(chunks) == 1
        assert chunks[0].content == text

    def test_chunk_type_is_text(self, settings):
        """The ``chunk_type`` attribute of the chunker is ``ChunkType.TEXT``."""
        from chunking.text import TextChunker
        from models import ChunkType

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        assert chunker.chunk_type == ChunkType.TEXT
class TestChunkerFactory:
    """Exercise ``ChunkerFactory`` lookup, dispatch and caching."""

    @staticmethod
    def _new_factory(settings):
        """Return a fresh ``ChunkerFactory`` built from the given settings."""
        from chunking.base import ChunkerFactory

        return ChunkerFactory(settings=settings)

    def test_get_code_chunker(self, settings):
        """``FileType.PYTHON`` resolves to a ``CodeChunker``."""
        from chunking.code import CodeChunker
        from models import FileType

        resolved = self._new_factory(settings).get_chunker(file_type=FileType.PYTHON)

        assert isinstance(resolved, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """``FileType.MARKDOWN`` resolves to a ``MarkdownChunker``."""
        from chunking.markdown import MarkdownChunker
        from models import FileType

        resolved = self._new_factory(settings).get_chunker(
            file_type=FileType.MARKDOWN
        )

        assert isinstance(resolved, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """``FileType.TEXT`` resolves to a ``TextChunker``."""
        from chunking.text import TextChunker
        from models import FileType

        resolved = self._new_factory(settings).get_chunker(file_type=FileType.TEXT)

        assert isinstance(resolved, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """A path resolves to a (chunker, file type) pair for ``.py`` and ``.md``."""
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        factory = self._new_factory(settings)
        expectations = (
            ("/test/file.py", CodeChunker, FileType.PYTHON),
            ("/test/docs.md", MarkdownChunker, FileType.MARKDOWN),
        )

        for path, chunker_cls, expected_type in expectations:
            resolved, detected_type = factory.get_chunker_for_path(path)
            assert isinstance(resolved, chunker_cls)
            assert detected_type == expected_type

    def test_chunk_content(self, settings, sample_python_code):
        """``chunk_content`` on a ``.py`` path returns CODE-typed chunks."""
        from models import ChunkType

        produced = self._new_factory(settings).chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(produced) > 0
        assert all(piece.chunk_type == ChunkType.CODE for piece in produced)

    def test_default_to_text_chunker(self, settings):
        """Calling ``get_chunker`` with no arguments gives a ``TextChunker``."""
        from chunking.text import TextChunker

        resolved = self._new_factory(settings).get_chunker()

        assert isinstance(resolved, TextChunker)

    def test_chunker_caching(self, settings):
        """Two lookups for the same file type return the very same object."""
        from models import FileType

        factory = self._new_factory(settings)

        first = factory.get_chunker(file_type=FileType.PYTHON)
        second = factory.get_chunker(file_type=FileType.PYTHON)

        assert first is second
class TestGlobalChunkerFactory:
    """Exercise the module-level factory accessor and its reset function."""

    def test_get_chunker_factory_singleton(self):
        """Consecutive ``get_chunker_factory`` calls return the same object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Reset before the first lookup, then compare two lookups.
        reset_chunker_factory()

        first_lookup = get_chunker_factory()
        second_lookup = get_chunker_factory()

        assert first_lookup is second_lookup

    def test_reset_chunker_factory(self):
        """After ``reset_chunker_factory`` the accessor returns a new object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        before_reset = get_chunker_factory()
        reset_chunker_factory()
        after_reset = get_chunker_factory()

        assert before_reset is not after_reset