Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document

Testing:

- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
423 lines
12 KiB
Python
423 lines
12 KiB
Python
"""Tests for chunking module."""
|
|
|
|
|
|
|
|
class TestBaseChunker:
    """Checks for the token helpers, exercised through TextChunker."""

    def test_count_tokens(self, settings):
        """A short phrase produces a small, positive token count."""
        from chunking.text import TextChunker

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        # "Hello, world!" is expected to be roughly 3-4 tokens.
        token_total = splitter.count_tokens("Hello, world!")
        assert 0 < token_total < 10

    def test_truncate_to_tokens(self, settings):
        """Truncated text never exceeds the requested token budget."""
        from chunking.text import TextChunker

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        repeated_words = "word " * 1000
        shortened = splitter.truncate_to_tokens(repeated_words, 10)

        assert splitter.count_tokens(shortened) <= 10
|
|
|
|
|
|
class TestCodeChunker:
    """Checks for the source-code chunker."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Python input yields CODE chunks tagged with the PYTHON file type."""
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        splitter = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)
        pieces = splitter.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE
        for piece in pieces:
            assert piece.file_type == FileType.PYTHON

    def test_preserves_function_boundaries(self, settings):
        """Every chunk of a two-function module carries an ordered line span."""
        from chunking.code import CodeChunker
        from models import FileType

        source = '''def function_one():
    """First function."""
    return 1

def function_two():
    """Second function."""
    return 2
'''

        splitter = CodeChunker(chunk_size=100, chunk_overlap=10, settings=settings)
        pieces = splitter.chunk(
            content=source,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Ideally one chunk per function; only a non-empty result is required.
        assert len(pieces) >= 1
        for piece in pieces:
            # Line numbers must be present and start must not follow end.
            assert piece.start_line is not None
            assert piece.end_line is not None
            assert piece.start_line <= piece.end_line

    def test_handles_empty_content(self, settings):
        """An empty string produces an empty chunk list."""
        from chunking.code import CodeChunker

        splitter = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        pieces = splitter.chunk(content="", source_path="/test/empty.py")

        assert pieces == []

    def test_chunk_type_is_code(self, settings):
        """The chunker's chunk_type attribute equals ChunkType.CODE."""
        from chunking.code import CodeChunker
        from models import ChunkType

        splitter = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        assert splitter.chunk_type == ChunkType.CODE
|
|
|
|
|
|
class TestMarkdownChunker:
    """Checks for the Markdown chunker."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """Markdown input yields at least one chunk, all of type MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        splitter = MarkdownChunker(
            chunk_size=800, chunk_overlap=100, settings=settings
        )
        pieces = splitter.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.MARKDOWN

    def test_respects_heading_hierarchy(self, settings):
        """A document with nested headings chunks into non-empty pieces."""
        from chunking.markdown import MarkdownChunker

        document = '''# Main Title

Introduction paragraph.

## Section One

Content for section one.

### Subsection

More detailed content.

## Section Two

Content for section two.
'''

        splitter = MarkdownChunker(chunk_size=200, chunk_overlap=20, settings=settings)
        pieces = splitter.chunk(
            content=document,
            source_path="/test/docs.md",
        )

        # Sections may be split into several chunks; at least one is required.
        assert len(pieces) >= 1
        # Heading metadata is not inspected here; each chunk just needs content.
        for piece in pieces:
            assert len(piece.content) > 0

    def test_handles_code_blocks(self, settings):
        """A fenced code block survives chunking somewhere in the output."""
        from chunking.markdown import MarkdownChunker

        document = '''# Code Example

Here's some code:

```python
def hello():
    print("Hello, World!")
```

End of example.
'''

        splitter = MarkdownChunker(chunk_size=500, chunk_overlap=50, settings=settings)
        pieces = splitter.chunk(
            content=document,
            source_path="/test/code.md",
        )

        assert len(pieces) >= 1
        # Either the fence marker or the function header must still be present.
        joined = " ".join(piece.content for piece in pieces)
        assert any(marker in joined for marker in ("```python", "def hello"))

    def test_chunk_type_is_markdown(self, settings):
        """The chunker's chunk_type attribute equals ChunkType.MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        splitter = MarkdownChunker(
            chunk_size=800, chunk_overlap=100, settings=settings
        )

        assert splitter.chunk_type == ChunkType.MARKDOWN
|
|
|
|
|
|
class TestTextChunker:
    """Checks for the plain-text chunker."""

    def test_chunk_text(self, settings, sample_text):
        """Plain text yields at least one chunk, all of type TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)
        pieces = splitter.chunk(
            content=sample_text,
            source_path="/test/docs.txt",
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.TEXT

    def test_respects_paragraph_boundaries(self, settings):
        """Three blank-line-separated paragraphs produce at least one chunk."""
        from chunking.text import TextChunker

        paragraphs = '''First paragraph with some content.

Second paragraph with different content.

Third paragraph to test chunking behavior.
'''

        splitter = TextChunker(chunk_size=100, chunk_overlap=10, settings=settings)
        pieces = splitter.chunk(
            content=paragraphs,
            source_path="/test/text.txt",
        )

        assert len(pieces) >= 1

    def test_handles_single_paragraph(self, settings):
        """A short paragraph comes back as exactly one unmodified chunk."""
        from chunking.text import TextChunker

        short_text = "This is a short paragraph."

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)
        pieces = splitter.chunk(content=short_text, source_path="/test/short.txt")

        assert len(pieces) == 1
        assert pieces[0].content == short_text

    def test_chunk_type_is_text(self, settings):
        """The chunker's chunk_type attribute equals ChunkType.TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        assert splitter.chunk_type == ChunkType.TEXT
|
|
|
|
|
|
class TestChunkerFactory:
    """Checks for ChunkerFactory dispatch, convenience API and caching."""

    def test_get_code_chunker(self, settings):
        """FileType.PYTHON resolves to a CodeChunker."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.PYTHON
        )

        assert isinstance(resolved, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """FileType.MARKDOWN resolves to a MarkdownChunker."""
        from chunking.base import ChunkerFactory
        from chunking.markdown import MarkdownChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.MARKDOWN
        )

        assert isinstance(resolved, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """FileType.TEXT resolves to a TextChunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.TEXT
        )

        assert isinstance(resolved, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """Path-based lookup returns the matching chunker and file type."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        factory = ChunkerFactory(settings=settings)

        # (path, expected chunker class, expected file type), checked in order.
        expectations = (
            ("/test/file.py", CodeChunker, FileType.PYTHON),
            ("/test/docs.md", MarkdownChunker, FileType.MARKDOWN),
        )
        for path, chunker_cls, expected_type in expectations:
            resolved, detected_type = factory.get_chunker_for_path(path)
            assert isinstance(resolved, chunker_cls)
            assert detected_type == expected_type

    def test_chunk_content(self, settings, sample_python_code):
        """chunk_content on a .py path yields non-empty, CODE-typed chunks."""
        from chunking.base import ChunkerFactory
        from models import ChunkType

        factory = ChunkerFactory(settings=settings)
        pieces = factory.chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE

    def test_default_to_text_chunker(self, settings):
        """get_chunker with no arguments returns a TextChunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker

        resolved = ChunkerFactory(settings=settings).get_chunker()

        assert isinstance(resolved, TextChunker)

    def test_chunker_caching(self, settings):
        """Two lookups for the same file type return the identical object."""
        from chunking.base import ChunkerFactory
        from models import FileType

        factory = ChunkerFactory(settings=settings)

        first = factory.get_chunker(file_type=FileType.PYTHON)
        second = factory.get_chunker(file_type=FileType.PYTHON)

        assert first is second
|
|
|
|
|
|
class TestGlobalChunkerFactory:
    """Checks for the module-level chunker factory accessor and its reset."""

    def test_get_chunker_factory_singleton(self):
        """Consecutive get_chunker_factory calls return the identical object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Reset first so the comparison does not depend on earlier tests.
        reset_chunker_factory()
        first = get_chunker_factory()
        second = get_chunker_factory()

        assert first is second

    def test_reset_chunker_factory(self):
        """After a reset, get_chunker_factory returns a different object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        before_reset = get_chunker_factory()
        reset_chunker_factory()
        after_reset = get_chunker_factory()

        assert before_reset is not after_reset
|