"""Tests for chunking module.

Project imports are done inside each test rather than at module level, so
this file can be collected even when an individual chunker module fails to
import; only the tests that need that module will fail.

The ``settings``, ``sample_python_code``, ``sample_markdown`` and
``sample_text`` arguments are pytest fixtures defined outside this file.
"""


class TestBaseChunker:
    """Tests for base chunker functionality.

    The shared helpers are exercised through ``TextChunker``.
    """

    def test_count_tokens(self, settings):
        """Test token counting."""
        from chunking.text import TextChunker

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        # Simple text should count tokens
        tokens = chunker.count_tokens("Hello, world!")

        assert tokens > 0
        assert tokens < 10  # Should be about 3-4 tokens

    def test_truncate_to_tokens(self, settings):
        """Test truncating text to token limit."""
        from chunking.text import TextChunker

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        long_text = "word " * 1000
        truncated = chunker.truncate_to_tokens(long_text, 10)

        # Re-count with the same chunker so the limit is checked in the
        # same token units that were used to truncate.
        assert chunker.count_tokens(truncated) <= 10


class TestCodeChunker:
    """Tests for code chunker."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Test chunking Python code."""
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.CODE for c in chunks)
        assert all(c.file_type == FileType.PYTHON for c in chunks)

    def test_preserves_function_boundaries(self, settings):
        """Test that chunker preserves function boundaries.

        Only the presence and ordering of line numbers is asserted; the
        exact number of chunks produced is not pinned.
        """
        from chunking.code import CodeChunker
        from models import FileType

        code = '''def function_one():
    """First function."""
    return 1


def function_two():
    """Second function."""
    return 2
'''

        chunker = CodeChunker(
            chunk_size=100,
            chunk_overlap=10,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=code,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Each function should ideally be in its own chunk
        assert len(chunks) >= 1

        for chunk in chunks:
            # Check chunks have line numbers
            assert chunk.start_line is not None
            assert chunk.end_line is not None
            assert chunk.start_line <= chunk.end_line

    def test_handles_empty_content(self, settings):
        """Test handling empty content."""
        from chunking.code import CodeChunker

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(content="", source_path="/test/empty.py")

        assert chunks == []

    def test_chunk_type_is_code(self, settings):
        """Test that chunk_type property returns CODE."""
        from chunking.code import CodeChunker
        from models import ChunkType

        chunker = CodeChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        assert chunker.chunk_type == ChunkType.CODE


class TestMarkdownChunker:
    """Tests for markdown chunker."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """Test chunking markdown content."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        chunker = MarkdownChunker(
            chunk_size=800,
            chunk_overlap=100,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.MARKDOWN for c in chunks)

    def test_respects_heading_hierarchy(self, settings):
        """Test that chunker respects heading hierarchy.

        The document has three heading levels. This test only asserts that
        at least one chunk is produced and that no chunk is empty.
        """
        from chunking.markdown import MarkdownChunker

        markdown = '''# Main Title

Introduction paragraph.

## Section One

Content for section one.

### Subsection

More detailed content.

## Section Two

Content for section two.
'''

        chunker = MarkdownChunker(
            chunk_size=200,
            chunk_overlap=20,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=markdown,
            source_path="/test/docs.md",
        )

        # Should have multiple chunks based on sections
        assert len(chunks) >= 1

        # NOTE: heading metadata is not inspected here; only the chunk
        # content is checked.
        for chunk in chunks:
            # Chunks should have content
            assert len(chunk.content) > 0

    def test_handles_code_blocks(self, settings):
        """Test handling of code blocks in markdown."""
        from chunking.markdown import MarkdownChunker

        markdown = '''# Code Example

Here's some code:

```python
def hello():
    print("Hello, World!")
```

End of example.
'''

        chunker = MarkdownChunker(
            chunk_size=500,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=markdown,
            source_path="/test/code.md",
        )

        # Code blocks should be preserved
        assert len(chunks) >= 1

        # Join every chunk so the check passes regardless of which chunk
        # the code ended up in; either the fence or the code body counts.
        full_content = " ".join(c.content for c in chunks)
        assert "```python" in full_content or "def hello" in full_content

    def test_chunk_type_is_markdown(self, settings):
        """Test that chunk_type property returns MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        chunker = MarkdownChunker(
            chunk_size=800,
            chunk_overlap=100,
            settings=settings,
        )

        assert chunker.chunk_type == ChunkType.MARKDOWN


class TestTextChunker:
    """Tests for text chunker."""

    def test_chunk_text(self, settings, sample_text):
        """Test chunking plain text."""
        from chunking.text import TextChunker
        from models import ChunkType

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=sample_text,
            source_path="/test/docs.txt",
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.TEXT for c in chunks)

    def test_respects_paragraph_boundaries(self, settings):
        """Test that chunker respects paragraph boundaries.

        Only asserts that three short paragraphs yield at least one chunk.
        """
        from chunking.text import TextChunker

        text = '''First paragraph with some content.

Second paragraph with different content.

Third paragraph to test chunking behavior.
'''

        chunker = TextChunker(
            chunk_size=100,
            chunk_overlap=10,
            settings=settings,
        )

        chunks = chunker.chunk(
            content=text,
            source_path="/test/text.txt",
        )

        assert len(chunks) >= 1

    def test_handles_single_paragraph(self, settings):
        """Test handling of single paragraph that fits in one chunk."""
        from chunking.text import TextChunker

        text = "This is a short paragraph."

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        chunks = chunker.chunk(content=text, source_path="/test/short.txt")

        assert len(chunks) == 1
        assert chunks[0].content == text

    def test_chunk_type_is_text(self, settings):
        """Test that chunk_type property returns TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        chunker = TextChunker(
            chunk_size=400,
            chunk_overlap=50,
            settings=settings,
        )

        assert chunker.chunk_type == ChunkType.TEXT


class TestChunkerFactory:
    """Tests for chunker factory."""

    def test_get_code_chunker(self, settings):
        """Test getting code chunker."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from models import FileType

        factory = ChunkerFactory(settings=settings)
        chunker = factory.get_chunker(file_type=FileType.PYTHON)

        assert isinstance(chunker, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """Test getting markdown chunker."""
        from chunking.base import ChunkerFactory
        from chunking.markdown import MarkdownChunker
        from models import FileType

        factory = ChunkerFactory(settings=settings)
        chunker = factory.get_chunker(file_type=FileType.MARKDOWN)

        assert isinstance(chunker, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """Test getting text chunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker
        from models import FileType

        factory = ChunkerFactory(settings=settings)
        chunker = factory.get_chunker(file_type=FileType.TEXT)

        assert isinstance(chunker, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """Test getting chunker based on file path."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        factory = ChunkerFactory(settings=settings)

        chunker, file_type = factory.get_chunker_for_path("/test/file.py")
        assert isinstance(chunker, CodeChunker)
        assert file_type == FileType.PYTHON

        chunker, file_type = factory.get_chunker_for_path("/test/docs.md")
        assert isinstance(chunker, MarkdownChunker)
        assert file_type == FileType.MARKDOWN

    def test_chunk_content(self, settings, sample_python_code):
        """Test chunk_content convenience method."""
        from chunking.base import ChunkerFactory
        from models import ChunkType

        factory = ChunkerFactory(settings=settings)

        # No file_type is passed; the CODE assertion below relies on the
        # factory working it out from the ".py" source path.
        chunks = factory.chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(chunks) > 0
        assert all(c.chunk_type == ChunkType.CODE for c in chunks)

    def test_default_to_text_chunker(self, settings):
        """Test defaulting to text chunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker

        factory = ChunkerFactory(settings=settings)
        chunker = factory.get_chunker()

        assert isinstance(chunker, TextChunker)

    def test_chunker_caching(self, settings):
        """Test that factory caches chunker instances."""
        from chunking.base import ChunkerFactory
        from models import FileType

        factory = ChunkerFactory(settings=settings)

        chunker1 = factory.get_chunker(file_type=FileType.PYTHON)
        chunker2 = factory.get_chunker(file_type=FileType.PYTHON)

        # Identity, not equality: the very same object must be returned.
        assert chunker1 is chunker2


class TestGlobalChunkerFactory:
    """Tests for global chunker factory.

    Each test resets the factory on exit, pass or fail, so the instance it
    created is not left behind for whichever test runs next.
    """

    def test_get_chunker_factory_singleton(self):
        """Test that get_chunker_factory returns singleton."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Start from a clean slate in case an earlier test left a factory.
        reset_chunker_factory()

        try:
            factory1 = get_chunker_factory()
            factory2 = get_chunker_factory()

            assert factory1 is factory2
        finally:
            # Do not leak the factory created here into later tests.
            reset_chunker_factory()

    def test_reset_chunker_factory(self):
        """Test resetting chunker factory."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        try:
            factory1 = get_chunker_factory()
            reset_chunker_factory()
            factory2 = get_chunker_factory()

            assert factory1 is not factory2
        finally:
            # factory2 is still installed at this point; clear it so the
            # next test starts without a pre-existing factory.
            reset_chunker_factory()