# Forked from cardosofelipe/fast-next-template.
# Change note: Improved code formatting, line breaks, and indentation across
# chunking logic and multiple test modules to enhance code clarity and maintain
# consistent style. No functional changes made.
# File info: 422 lines, 12 KiB, Python.
"""Tests for chunking module."""
|
|
|
|
|
|
class TestBaseChunker:
    """Tests for base chunker functionality, exercised through TextChunker."""

    def test_count_tokens(self, settings):
        """A short greeting produces a small, positive token count."""
        from chunking.text import TextChunker

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)
        token_count = splitter.count_tokens("Hello, world!")

        # Expected to be roughly 3-4 tokens; the bounds are intentionally loose.
        assert 0 < token_count < 10

    def test_truncate_to_tokens(self, settings):
        """Text truncated to a 10-token limit counts at most 10 tokens."""
        from chunking.text import TextChunker

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        shortened = splitter.truncate_to_tokens("word " * 1000, 10)

        assert splitter.count_tokens(shortened) <= 10
|
|
|
|
|
|
class TestCodeChunker:
    """Tests for CodeChunker."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Chunking the sample Python source yields only Python CODE chunks."""
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        splitter = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)
        pieces = splitter.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE
        for piece in pieces:
            assert piece.file_type == FileType.PYTHON

    def test_preserves_function_boundaries(self, settings):
        """Chunks of a two-function module carry ordered start/end lines."""
        from chunking.code import CodeChunker
        from models import FileType

        source = '''def function_one():
    """First function."""
    return 1

def function_two():
    """Second function."""
    return 2
'''

        splitter = CodeChunker(chunk_size=100, chunk_overlap=10, settings=settings)
        pieces = splitter.chunk(
            content=source,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Ideally each function gets its own chunk; only a non-empty result
        # is required by this test.
        assert len(pieces) >= 1
        for piece in pieces:
            # Line numbers must be populated and in order.
            assert piece.start_line is not None
            assert piece.end_line is not None
            assert piece.start_line <= piece.end_line

    def test_handles_empty_content(self, settings):
        """Empty content produces an empty list of chunks."""
        from chunking.code import CodeChunker

        splitter = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        assert splitter.chunk(content="", source_path="/test/empty.py") == []

    def test_chunk_type_is_code(self, settings):
        """The chunk_type property of a CodeChunker equals ChunkType.CODE."""
        from chunking.code import CodeChunker
        from models import ChunkType

        splitter = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        assert splitter.chunk_type == ChunkType.CODE
|
|
|
|
|
|
class TestMarkdownChunker:
    """Tests for MarkdownChunker."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """Chunking the sample markdown yields only MARKDOWN chunks."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        splitter = MarkdownChunker(
            chunk_size=800, chunk_overlap=100, settings=settings
        )
        pieces = splitter.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.MARKDOWN

    def test_respects_heading_hierarchy(self, settings):
        """A document with nested headings yields chunks that all have content."""
        from chunking.markdown import MarkdownChunker

        document = """# Main Title

Introduction paragraph.

## Section One

Content for section one.

### Subsection

More detailed content.

## Section Two

Content for section two.
"""

        splitter = MarkdownChunker(
            chunk_size=200, chunk_overlap=20, settings=settings
        )
        pieces = splitter.chunk(content=document, source_path="/test/docs.md")

        # Sections would ideally map to separate chunks; only a non-empty
        # result is asserted here.
        assert len(pieces) >= 1
        # Heading metadata is not inspected; each chunk just needs content.
        for piece in pieces:
            assert len(piece.content) > 0

    def test_handles_code_blocks(self, settings):
        """A fenced code block in markdown survives into the chunk content."""
        from chunking.markdown import MarkdownChunker

        document = """# Code Example

Here's some code:

```python
def hello():
    print("Hello, World!")
```

End of example.
"""

        splitter = MarkdownChunker(
            chunk_size=500, chunk_overlap=50, settings=settings
        )
        pieces = splitter.chunk(content=document, source_path="/test/code.md")

        assert len(pieces) >= 1
        # Either the fence opener or the function header must be present
        # somewhere in the combined chunk text.
        combined = " ".join(piece.content for piece in pieces)
        assert any(marker in combined for marker in ("```python", "def hello"))

    def test_chunk_type_is_markdown(self, settings):
        """The chunk_type property of a MarkdownChunker equals ChunkType.MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        splitter = MarkdownChunker(
            chunk_size=800, chunk_overlap=100, settings=settings
        )

        assert splitter.chunk_type == ChunkType.MARKDOWN
|
|
|
|
|
|
class TestTextChunker:
    """Tests for TextChunker."""

    def test_chunk_text(self, settings, sample_text):
        """Chunking the sample plain text yields only TEXT chunks."""
        from chunking.text import TextChunker
        from models import ChunkType

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)
        pieces = splitter.chunk(content=sample_text, source_path="/test/docs.txt")

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.TEXT

    def test_respects_paragraph_boundaries(self, settings):
        """Three blank-line-separated paragraphs produce at least one chunk."""
        from chunking.text import TextChunker

        paragraphs = """First paragraph with some content.

Second paragraph with different content.

Third paragraph to test chunking behavior.
"""

        splitter = TextChunker(chunk_size=100, chunk_overlap=10, settings=settings)
        pieces = splitter.chunk(content=paragraphs, source_path="/test/text.txt")

        assert len(pieces) >= 1

    def test_handles_single_paragraph(self, settings):
        """A short paragraph comes back as exactly one unmodified chunk."""
        from chunking.text import TextChunker

        sentence = "This is a short paragraph."
        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        pieces = splitter.chunk(content=sentence, source_path="/test/short.txt")

        assert len(pieces) == 1
        assert pieces[0].content == sentence

    def test_chunk_type_is_text(self, settings):
        """The chunk_type property of a TextChunker equals ChunkType.TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        splitter = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        assert splitter.chunk_type == ChunkType.TEXT
|
|
|
|
|
|
class TestChunkerFactory:
    """Tests for ChunkerFactory."""

    def test_get_code_chunker(self, settings):
        """Requesting FileType.PYTHON returns a CodeChunker."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from models import FileType

        produced = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.PYTHON
        )

        assert isinstance(produced, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """Requesting FileType.MARKDOWN returns a MarkdownChunker."""
        from chunking.base import ChunkerFactory
        from chunking.markdown import MarkdownChunker
        from models import FileType

        produced = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.MARKDOWN
        )

        assert isinstance(produced, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """Requesting FileType.TEXT returns a TextChunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker
        from models import FileType

        produced = ChunkerFactory(settings=settings).get_chunker(
            file_type=FileType.TEXT
        )

        assert isinstance(produced, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """Path-based lookup returns the matching chunker and file type."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        registry = ChunkerFactory(settings=settings)

        # (path, expected chunker class, expected file type)
        expectations = (
            ("/test/file.py", CodeChunker, FileType.PYTHON),
            ("/test/docs.md", MarkdownChunker, FileType.MARKDOWN),
        )
        for path, chunker_cls, expected_type in expectations:
            produced, detected_type = registry.get_chunker_for_path(path)
            assert isinstance(produced, chunker_cls)
            assert detected_type == expected_type

    def test_chunk_content(self, settings, sample_python_code):
        """chunk_content on a .py path yields a non-empty list of CODE chunks."""
        from chunking.base import ChunkerFactory
        from models import ChunkType

        registry = ChunkerFactory(settings=settings)
        pieces = registry.chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(pieces) > 0
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE

    def test_default_to_text_chunker(self, settings):
        """Calling get_chunker with no arguments returns a TextChunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker

        produced = ChunkerFactory(settings=settings).get_chunker()

        assert isinstance(produced, TextChunker)

    def test_chunker_caching(self, settings):
        """Two requests for the same file type return the same instance."""
        from chunking.base import ChunkerFactory
        from models import FileType

        registry = ChunkerFactory(settings=settings)

        first = registry.get_chunker(file_type=FileType.PYTHON)
        second = registry.get_chunker(file_type=FileType.PYTHON)

        assert first is second
|
|
|
|
|
|
class TestGlobalChunkerFactory:
    """Tests for the module-level chunker factory accessors."""

    def test_get_chunker_factory_singleton(self):
        """Back-to-back get_chunker_factory calls return the same object."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Reset first so the comparison does not depend on earlier calls.
        reset_chunker_factory()

        first = get_chunker_factory()
        second = get_chunker_factory()

        assert first is second

    def test_reset_chunker_factory(self):
        """A reset between two lookups makes them return different objects."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        before_reset = get_chunker_factory()
        reset_chunker_factory()
        after_reset = get_chunker_factory()

        assert before_reset is not after_reset
|