feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search: - Intelligent chunking strategies (code-aware, markdown-aware, text) - Semantic search with vector similarity (HNSW index) - Keyword search with PostgreSQL full-text search - Hybrid search using Reciprocal Rank Fusion (RRF) - Redis caching for embeddings - Collection management (ingest, search, delete, stats) - FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document Testing: - 128 comprehensive tests covering all components - 58% code coverage (database integration tests use mocks) - Passes ruff linting and mypy type checking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
422
mcp-servers/knowledge-base/tests/test_chunking.py
Normal file
422
mcp-servers/knowledge-base/tests/test_chunking.py
Normal file
@@ -0,0 +1,422 @@
|
||||
"""Tests for chunking module."""
|
||||
|
||||
|
||||
|
||||
class TestBaseChunker:
    """Tests for base chunker helpers, exercised through ``TextChunker``."""

    def test_count_tokens(self, settings):
        """A short greeting produces a small, non-zero token count."""
        from chunking.text import TextChunker

        options = {"chunk_size": 400, "chunk_overlap": 50, "settings": settings}
        text_chunker = TextChunker(**options)

        token_count = text_chunker.count_tokens("Hello, world!")

        # Roughly 3-4 tokens are expected; the bounds are intentionally loose.
        assert 0 < token_count < 10

    def test_truncate_to_tokens(self, settings):
        """Truncated text never exceeds the requested token budget."""
        from chunking.text import TextChunker

        options = {"chunk_size": 400, "chunk_overlap": 50, "settings": settings}
        text_chunker = TextChunker(**options)

        token_limit = 10
        oversized = "word " * 1000
        shortened = text_chunker.truncate_to_tokens(oversized, token_limit)

        assert text_chunker.count_tokens(shortened) <= token_limit
|
||||
|
||||
|
||||
class TestCodeChunker:
    """Checks for the code-aware chunker."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Python input yields CODE chunks tagged with the PYTHON file type."""
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        options = {"chunk_size": 500, "chunk_overlap": 50, "settings": settings}
        code_chunker = CodeChunker(**options)

        pieces = code_chunker.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE
        for piece in pieces:
            assert piece.file_type == FileType.PYTHON

    def test_preserves_function_boundaries(self, settings):
        """Chunks of a two-function module carry consistent line ranges."""
        from chunking.code import CodeChunker
        from models import FileType

        # Two tiny functions separated by one blank line.
        source = (
            "def function_one():\n"
            '    """First function."""\n'
            "    return 1\n"
            "\n"
            "def function_two():\n"
            '    """Second function."""\n'
            "    return 2\n"
        )

        options = {"chunk_size": 100, "chunk_overlap": 10, "settings": settings}
        code_chunker = CodeChunker(**options)

        pieces = code_chunker.chunk(
            content=source,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Ideally one chunk per function, but only a non-empty result is required.
        assert len(pieces) >= 1
        for piece in pieces:
            # Every chunk must report a well-ordered line range.
            assert piece.start_line is not None
            assert piece.end_line is not None
            assert piece.start_line <= piece.end_line

    def test_handles_empty_content(self, settings):
        """An empty string produces an empty list of chunks."""
        from chunking.code import CodeChunker

        options = {"chunk_size": 500, "chunk_overlap": 50, "settings": settings}
        code_chunker = CodeChunker(**options)

        pieces = code_chunker.chunk(content="", source_path="/test/empty.py")

        assert pieces == []

    def test_chunk_type_is_code(self, settings):
        """The ``chunk_type`` property reports ``ChunkType.CODE``."""
        from chunking.code import CodeChunker
        from models import ChunkType

        options = {"chunk_size": 500, "chunk_overlap": 50, "settings": settings}
        code_chunker = CodeChunker(**options)

        assert code_chunker.chunk_type == ChunkType.CODE
|
||||
|
||||
|
||||
class TestMarkdownChunker:
    """Checks for the markdown-aware chunker."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """Markdown input yields at least one chunk, all typed MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        options = {"chunk_size": 800, "chunk_overlap": 100, "settings": settings}
        md_chunker = MarkdownChunker(**options)

        pieces = md_chunker.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.MARKDOWN

    def test_respects_heading_hierarchy(self, settings):
        """A document with nested headings yields non-empty chunks."""
        from chunking.markdown import MarkdownChunker

        # Three heading levels, one short paragraph under each heading.
        document = "\n".join(
            [
                "# Main Title",
                "",
                "Introduction paragraph.",
                "",
                "## Section One",
                "",
                "Content for section one.",
                "",
                "### Subsection",
                "",
                "More detailed content.",
                "",
                "## Section Two",
                "",
                "Content for section two.",
                "",
            ]
        )

        options = {"chunk_size": 200, "chunk_overlap": 20, "settings": settings}
        md_chunker = MarkdownChunker(**options)

        pieces = md_chunker.chunk(
            content=document,
            source_path="/test/docs.md",
        )

        # At minimum one chunk must come back, and none may be empty.
        assert len(pieces) >= 1
        for piece in pieces:
            assert len(piece.content) > 0

    def test_handles_code_blocks(self, settings):
        """Fenced code inside markdown survives chunking."""
        from chunking.markdown import MarkdownChunker

        document = (
            "# Code Example\n"
            "\n"
            "Here's some code:\n"
            "\n"
            "```python\n"
            "def hello():\n"
            '    print("Hello, World!")\n'
            "```\n"
            "\n"
            "End of example.\n"
        )

        options = {"chunk_size": 500, "chunk_overlap": 50, "settings": settings}
        md_chunker = MarkdownChunker(**options)

        pieces = md_chunker.chunk(
            content=document,
            source_path="/test/code.md",
        )

        assert len(pieces) >= 1

        # Either the fence marker or the function header must still be present.
        combined = " ".join([piece.content for piece in pieces])
        assert any(marker in combined for marker in ("```python", "def hello"))

    def test_chunk_type_is_markdown(self, settings):
        """The ``chunk_type`` property reports ``ChunkType.MARKDOWN``."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        options = {"chunk_size": 800, "chunk_overlap": 100, "settings": settings}
        md_chunker = MarkdownChunker(**options)

        assert md_chunker.chunk_type == ChunkType.MARKDOWN
|
||||
|
||||
|
||||
class TestTextChunker:
    """Checks for the plain-text chunker."""

    def test_chunk_text(self, settings, sample_text):
        """Plain text yields at least one chunk, all typed TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        options = {"chunk_size": 400, "chunk_overlap": 50, "settings": settings}
        text_chunker = TextChunker(**options)

        pieces = text_chunker.chunk(
            content=sample_text,
            source_path="/test/docs.txt",
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.TEXT

    def test_respects_paragraph_boundaries(self, settings):
        """Three blank-line-separated paragraphs produce at least one chunk."""
        from chunking.text import TextChunker

        paragraphs = (
            "First paragraph with some content.\n"
            "\n"
            "Second paragraph with different content.\n"
            "\n"
            "Third paragraph to test chunking behavior.\n"
        )

        options = {"chunk_size": 100, "chunk_overlap": 10, "settings": settings}
        text_chunker = TextChunker(**options)

        pieces = text_chunker.chunk(
            content=paragraphs,
            source_path="/test/text.txt",
        )

        assert len(pieces) >= 1

    def test_handles_single_paragraph(self, settings):
        """A short paragraph comes back as exactly one unmodified chunk."""
        from chunking.text import TextChunker

        sentence = "This is a short paragraph."

        options = {"chunk_size": 400, "chunk_overlap": 50, "settings": settings}
        text_chunker = TextChunker(**options)

        pieces = text_chunker.chunk(content=sentence, source_path="/test/short.txt")

        assert len(pieces) == 1
        assert pieces[0].content == sentence

    def test_chunk_type_is_text(self, settings):
        """The ``chunk_type`` property reports ``ChunkType.TEXT``."""
        from chunking.text import TextChunker
        from models import ChunkType

        options = {"chunk_size": 400, "chunk_overlap": 50, "settings": settings}
        text_chunker = TextChunker(**options)

        assert text_chunker.chunk_type == ChunkType.TEXT
|
||||
|
||||
|
||||
class TestChunkerFactory:
    """Checks for ``ChunkerFactory`` lookup, dispatch and caching."""

    def test_get_code_chunker(self, settings):
        """Requesting the PYTHON file type returns a ``CodeChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from models import FileType

        selected = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.PYTHON
        )

        assert isinstance(selected, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """Requesting the MARKDOWN file type returns a ``MarkdownChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.markdown import MarkdownChunker
        from models import FileType

        selected = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.MARKDOWN
        )

        assert isinstance(selected, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """Requesting the TEXT file type returns a ``TextChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker
        from models import FileType

        selected = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.TEXT
        )

        assert isinstance(selected, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """The file path decides both the chunker class and the file type."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        registry = ChunkerFactory(settings=settings)

        # (path, expected chunker class, expected file type)
        expectations = (
            ("/test/file.py", CodeChunker, FileType.PYTHON),
            ("/test/docs.md", MarkdownChunker, FileType.MARKDOWN),
        )
        for path, expected_class, expected_type in expectations:
            selected, detected_type = registry.get_chunker_for_path(path)
            assert isinstance(selected, expected_class)
            assert detected_type == expected_type

    def test_chunk_content(self, settings, sample_python_code):
        """``chunk_content`` on a ``.py`` path yields CODE chunks."""
        from chunking.base import ChunkerFactory
        from models import ChunkType

        registry = ChunkerFactory(settings=settings)

        pieces = registry.chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE

    def test_default_to_text_chunker(self, settings):
        """With no arguments, ``get_chunker`` returns a ``TextChunker``."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker

        selected = ChunkerFactory(settings=settings).get_chunker()

        assert isinstance(selected, TextChunker)

    def test_chunker_caching(self, settings):
        """Repeated requests for one file type return the same instance."""
        from chunking.base import ChunkerFactory
        from models import FileType

        registry = ChunkerFactory(settings=settings)

        first = registry.get_chunker(file_type=FileType.PYTHON)
        second = registry.get_chunker(file_type=FileType.PYTHON)

        assert first is second
|
||||
|
||||
|
||||
class TestGlobalChunkerFactory:
    """Tests for the shared factory returned by ``get_chunker_factory``.

    Both tests create a shared factory instance. Each one therefore calls
    ``reset_chunker_factory()`` on the way out, even when an assertion fails,
    so the instance it created is not handed to whichever test runs next.
    """

    def test_get_chunker_factory_singleton(self):
        """Two consecutive calls return the very same factory object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Start from a clean slate so an instance left behind by another test
        # cannot influence this one.
        reset_chunker_factory()
        try:
            factory1 = get_chunker_factory()
            factory2 = get_chunker_factory()

            assert factory1 is factory2
        finally:
            # Do not leak the factory created here into later tests.
            reset_chunker_factory()

    def test_reset_chunker_factory(self):
        """After a reset, the next call returns a different factory object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        try:
            factory1 = get_chunker_factory()
            reset_chunker_factory()
            factory2 = get_chunker_factory()

            assert factory1 is not factory2
        finally:
            # Do not leak the second factory into later tests.
            reset_chunker_factory()
|
||||
Reference in New Issue
Block a user