Files
fast-next-template/mcp-servers/knowledge-base/tests/test_chunking.py
Felipe Cardoso d0fc7f37ff feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content,
  list_collections, get_collection_stats, update_document

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:33:26 +01:00

423 lines
12 KiB
Python

"""Tests for chunking module."""
class TestBaseChunker:
    """Tests for the token helpers reached through ``TextChunker``.

    Each test builds a ``TextChunker`` with a 400-token chunk size and a
    50-token overlap from the ``settings`` fixture, then calls one of its
    token utility methods.
    """

    def test_count_tokens(self, settings):
        """A short greeting yields a small, strictly positive token count."""
        from chunking.text import TextChunker

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        token_total = text_chunker.count_tokens("Hello, world!")

        # Roughly 3-4 tokens are expected; the bounds are deliberately loose
        # so the test does not pin one tokenizer's exact output.
        assert 0 < token_total < 10

    def test_truncate_to_tokens(self, settings):
        """Truncated text never exceeds the requested token limit."""
        from chunking.text import TextChunker

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        # 1000 repetitions is far more than the 10-token limit used below.
        oversized = "word " * 1000
        shortened = text_chunker.truncate_to_tokens(oversized, 10)

        assert text_chunker.count_tokens(shortened) <= 10
class TestCodeChunker:
    """Tests for code chunker."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Chunking the Python sample yields CODE chunks tagged as PYTHON.

        Uses the ``settings`` and ``sample_python_code`` fixtures.
        """
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )
        chunks = chunker.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.CODE for c in chunks)
        assert all(c.file_type == FileType.PYTHON for c in chunks)

    def test_preserves_function_boundaries(self, settings):
        """Chunks of a two-function module carry consistent line numbers."""
        from chunking.code import CodeChunker
        from models import FileType

        # The sample must be syntactically valid Python: function bodies are
        # indented so a code-aware chunker sees two real function definitions.
        code = '''def function_one():
    """First function."""
    return 1
def function_two():
    """Second function."""
    return 2
'''
        chunker = CodeChunker(
            chunk_size=100,
            chunk_overlap=10,
            settings=settings,
        )
        chunks = chunker.chunk(
            content=code,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Each function should ideally be in its own chunk, but the exact
        # split is not pinned here; only a non-empty result is required.
        assert len(chunks) >= 1
        for chunk in chunks:
            # Every chunk must report a well-ordered line range.
            assert chunk.start_line is not None
            assert chunk.end_line is not None
            assert chunk.start_line <= chunk.end_line

    def test_handles_empty_content(self, settings):
        """Empty content produces an empty list of chunks."""
        from chunking.code import CodeChunker

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )
        chunks = chunker.chunk(content="", source_path="/test/empty.py")

        assert chunks == []

    def test_chunk_type_is_code(self, settings):
        """The ``chunk_type`` attribute of a ``CodeChunker`` is ``ChunkType.CODE``."""
        from chunking.code import CodeChunker
        from models import ChunkType

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        assert chunker.chunk_type == ChunkType.CODE
class TestMarkdownChunker:
    """Tests covering ``MarkdownChunker``."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """The markdown sample is split into at least one MARKDOWN chunk."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        md_chunker = MarkdownChunker(chunk_size=800, chunk_overlap=100, settings=settings)
        pieces = md_chunker.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.MARKDOWN

    def test_respects_heading_hierarchy(self, settings):
        """A document with nested headings produces only non-empty chunks."""
        from chunking.markdown import MarkdownChunker

        document = '''# Main Title
Introduction paragraph.
## Section One
Content for section one.
### Subsection
More detailed content.
## Section Two
Content for section two.
'''
        md_chunker = MarkdownChunker(chunk_size=200, chunk_overlap=20, settings=settings)
        pieces = md_chunker.chunk(content=document, source_path="/test/docs.md")

        # Section-based splitting may give several chunks; one is enough here.
        assert len(pieces) >= 1
        # Only the presence of content is verified; heading metadata is not
        # inspected by this test.
        assert all(len(piece.content) > 0 for piece in pieces)

    def test_handles_code_blocks(self, settings):
        """A fenced code block survives chunking somewhere in the output."""
        from chunking.markdown import MarkdownChunker

        document = '''# Code Example
Here's some code:
```python
def hello():
print("Hello, World!")
```
End of example.
'''
        md_chunker = MarkdownChunker(chunk_size=500, chunk_overlap=50, settings=settings)
        pieces = md_chunker.chunk(content=document, source_path="/test/code.md")

        assert len(pieces) >= 1
        # Either the fence opener or the function header must still be
        # present once all chunk contents are joined together.
        combined = " ".join(piece.content for piece in pieces)
        assert any(marker in combined for marker in ("```python", "def hello"))

    def test_chunk_type_is_markdown(self, settings):
        """``MarkdownChunker.chunk_type`` equals ``ChunkType.MARKDOWN``."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        md_chunker = MarkdownChunker(chunk_size=800, chunk_overlap=100, settings=settings)

        assert md_chunker.chunk_type == ChunkType.MARKDOWN
class TestTextChunker:
    """Tests covering ``TextChunker``."""

    def test_chunk_text(self, settings, sample_text):
        """The plain-text sample is split into at least one TEXT chunk."""
        from chunking.text import TextChunker
        from models import ChunkType

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)
        pieces = text_chunker.chunk(content=sample_text, source_path="/test/docs.txt")

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.TEXT

    def test_respects_paragraph_boundaries(self, settings):
        """Multi-line text chunked with a small chunk size yields output."""
        from chunking.text import TextChunker

        body = '''First paragraph with some content.
Second paragraph with different content.
Third paragraph to test chunking behavior.
'''
        text_chunker = TextChunker(chunk_size=100, chunk_overlap=10, settings=settings)
        pieces = text_chunker.chunk(content=body, source_path="/test/text.txt")

        # Only a non-empty result is required; split points are not checked.
        assert len(pieces) >= 1

    def test_handles_single_paragraph(self, settings):
        """A short paragraph comes back as exactly one unchanged chunk."""
        from chunking.text import TextChunker

        paragraph = "This is a short paragraph."
        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)
        pieces = text_chunker.chunk(content=paragraph, source_path="/test/short.txt")

        assert len(pieces) == 1
        assert pieces[0].content == paragraph

    def test_chunk_type_is_text(self, settings):
        """``TextChunker.chunk_type`` equals ``ChunkType.TEXT``."""
        from chunking.text import TextChunker
        from models import ChunkType

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        assert text_chunker.chunk_type == ChunkType.TEXT
class TestChunkerFactory:
    """Tests covering ``ChunkerFactory`` lookups and its chunking shortcut."""

    def test_get_code_chunker(self, settings):
        """Requesting ``FileType.PYTHON`` yields a ``CodeChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.PYTHON
        )

        assert isinstance(resolved, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """Requesting ``FileType.MARKDOWN`` yields a ``MarkdownChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.markdown import MarkdownChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.MARKDOWN
        )

        assert isinstance(resolved, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """Requesting ``FileType.TEXT`` yields a ``TextChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.TEXT
        )

        assert isinstance(resolved, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """Path lookup returns the matching chunker and detected file type."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        chunker_factory = ChunkerFactory(settings=settings)

        # (path, expected chunker class, expected file type), in call order.
        expectations = (
            ("/test/file.py", CodeChunker, FileType.PYTHON),
            ("/test/docs.md", MarkdownChunker, FileType.MARKDOWN),
        )
        for path, chunker_cls, expected_type in expectations:
            resolved, detected_type = chunker_factory.get_chunker_for_path(path)
            assert isinstance(resolved, chunker_cls)
            assert detected_type == expected_type

    def test_chunk_content(self, settings, sample_python_code):
        """``chunk_content`` on a ``.py`` path yields CODE chunks."""
        from chunking.base import ChunkerFactory
        from models import ChunkType

        chunker_factory = ChunkerFactory(settings=settings)
        pieces = chunker_factory.chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE

    def test_default_to_text_chunker(self, settings):
        """Calling ``get_chunker`` with no arguments yields a ``TextChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker

        resolved = ChunkerFactory(settings=settings).get_chunker()

        assert isinstance(resolved, TextChunker)

    def test_chunker_caching(self, settings):
        """Two lookups for the same file type return the same instance."""
        from chunking.base import ChunkerFactory
        from models import FileType

        chunker_factory = ChunkerFactory(settings=settings)
        first = chunker_factory.get_chunker(file_type=FileType.PYTHON)
        second = chunker_factory.get_chunker(file_type=FileType.PYTHON)

        assert first is second
class TestGlobalChunkerFactory:
    """Tests for global chunker factory.

    The factory under test is process-wide state, so it is reset before and
    after every test. This keeps each test independent of execution order and
    stops a factory created here from leaking into other tests.
    """

    def setup_method(self):
        """Start every test from a freshly reset global factory."""
        from chunking.base import reset_chunker_factory

        reset_chunker_factory()

    def teardown_method(self):
        """Drop the global factory created by the test that just ran."""
        from chunking.base import reset_chunker_factory

        reset_chunker_factory()

    def test_get_chunker_factory_singleton(self):
        """Test that get_chunker_factory returns singleton."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Explicit reset kept so the test reads correctly on its own.
        reset_chunker_factory()
        factory1 = get_chunker_factory()
        factory2 = get_chunker_factory()

        assert factory1 is factory2

    def test_reset_chunker_factory(self):
        """Test that resetting makes the next lookup return a new factory."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        factory1 = get_chunker_factory()
        reset_chunker_factory()
        factory2 = get_chunker_factory()

        assert factory1 is not factory2