# fast-next-template/mcp-servers/knowledge-base/tests/test_chunking.py

"""Tests for chunking module."""


class TestBaseChunker:
    """Token-helper tests, exercised through a TextChunker instance."""

    def test_count_tokens(self, settings):
        """A short phrase yields a small, strictly positive token count."""
        from chunking.text import TextChunker

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        token_total = text_chunker.count_tokens("Hello, world!")

        # Roughly 3-4 tokens are expected; accept anything strictly
        # between 0 and 10.
        assert 0 < token_total < 10

    def test_truncate_to_tokens(self, settings):
        """Truncated text must not exceed the requested token budget."""
        from chunking.text import TextChunker

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        token_budget = 10
        oversized = "word " * 1000
        clipped = text_chunker.truncate_to_tokens(oversized, token_budget)

        assert text_chunker.count_tokens(clipped) <= token_budget


class TestCodeChunker:
    """Behavioural tests for CodeChunker."""

    def test_chunk_python_code(self, settings, sample_python_code):
        """Python source is split into CODE chunks tagged as PYTHON."""
        from chunking.code import CodeChunker
        from models import ChunkType, FileType

        code_chunker = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        pieces = code_chunker.chunk(
            content=sample_python_code,
            source_path="/test/sample.py",
            file_type=FileType.PYTHON,
        )

        assert len(pieces) >= 1
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE
        for piece in pieces:
            assert piece.file_type == FileType.PYTHON

    def test_preserves_function_boundaries(self, settings):
        """Chunks of a two-function module carry a valid line range."""
        from chunking.code import CodeChunker
        from models import FileType

        source = (
            'def function_one():\n'
            '    """First function."""\n'
            '    return 1\n'
            '\n'
            'def function_two():\n'
            '    """Second function."""\n'
            '    return 2\n'
        )

        code_chunker = CodeChunker(chunk_size=100, chunk_overlap=10, settings=settings)

        pieces = code_chunker.chunk(
            content=source,
            source_path="/test/funcs.py",
            file_type=FileType.PYTHON,
        )

        # Only the presence of at least one chunk is required here; the
        # exact split between the two functions is not asserted.
        assert len(pieces) >= 1
        for piece in pieces:
            # Both ends of the line range must be set and correctly ordered.
            assert piece.start_line is not None
            assert piece.end_line is not None
            assert piece.start_line <= piece.end_line

    def test_handles_empty_content(self, settings):
        """An empty string produces an empty list of chunks."""
        from chunking.code import CodeChunker

        code_chunker = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        pieces = code_chunker.chunk(content="", source_path="/test/empty.py")

        assert pieces == []

    def test_chunk_type_is_code(self, settings):
        """The chunker's ``chunk_type`` property equals ChunkType.CODE."""
        from chunking.code import CodeChunker
        from models import ChunkType

        code_chunker = CodeChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        assert code_chunker.chunk_type == ChunkType.CODE


class TestMarkdownChunker:
    """Behavioural tests for MarkdownChunker."""

    def test_chunk_markdown(self, settings, sample_markdown):
        """Markdown input is split into chunks typed as MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType, FileType

        md_chunker = MarkdownChunker(chunk_size=800, chunk_overlap=100, settings=settings)

        pieces = md_chunker.chunk(
            content=sample_markdown,
            source_path="/test/docs.md",
            file_type=FileType.MARKDOWN,
        )

        assert len(pieces) >= 1
        for piece in pieces:
            assert piece.chunk_type == ChunkType.MARKDOWN

    def test_respects_heading_hierarchy(self, settings):
        """A document with nested headings yields non-empty chunks."""
        from chunking.markdown import MarkdownChunker

        document = (
            "# Main Title\n"
            "\n"
            "Introduction paragraph.\n"
            "\n"
            "## Section One\n"
            "\n"
            "Content for section one.\n"
            "\n"
            "### Subsection\n"
            "\n"
            "More detailed content.\n"
            "\n"
            "## Section Two\n"
            "\n"
            "Content for section two.\n"
        )

        md_chunker = MarkdownChunker(chunk_size=200, chunk_overlap=20, settings=settings)

        pieces = md_chunker.chunk(content=document, source_path="/test/docs.md")

        # At least one chunk must come back, and none may be empty.
        # Heading metadata itself is not inspected by this test.
        assert len(pieces) >= 1
        for piece in pieces:
            assert len(piece.content) > 0

    def test_handles_code_blocks(self, settings):
        """Fenced code inside markdown survives chunking."""
        from chunking.markdown import MarkdownChunker

        document = (
            "# Code Example\n"
            "\n"
            "Here's some code:\n"
            "\n"
            "```python\n"
            "def hello():\n"
            '    print("Hello, World!")\n'
            "```\n"
            "\n"
            "End of example.\n"
        )

        md_chunker = MarkdownChunker(chunk_size=500, chunk_overlap=50, settings=settings)

        pieces = md_chunker.chunk(content=document, source_path="/test/code.md")

        assert len(pieces) >= 1
        # Either the fence marker or the function header must still be
        # present somewhere in the combined chunk text.
        combined = " ".join(piece.content for piece in pieces)
        assert "```python" in combined or "def hello" in combined

    def test_chunk_type_is_markdown(self, settings):
        """The chunker's ``chunk_type`` property equals ChunkType.MARKDOWN."""
        from chunking.markdown import MarkdownChunker
        from models import ChunkType

        md_chunker = MarkdownChunker(chunk_size=800, chunk_overlap=100, settings=settings)

        assert md_chunker.chunk_type == ChunkType.MARKDOWN


class TestTextChunker:
    """Behavioural tests for TextChunker."""

    def test_chunk_text(self, settings, sample_text):
        """Plain text is split into chunks typed as TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        pieces = text_chunker.chunk(content=sample_text, source_path="/test/docs.txt")

        assert len(pieces) >= 1
        for piece in pieces:
            assert piece.chunk_type == ChunkType.TEXT

    def test_respects_paragraph_boundaries(self, settings):
        """Three blank-line-separated paragraphs yield at least one chunk."""
        from chunking.text import TextChunker

        body = (
            "First paragraph with some content.\n"
            "\n"
            "Second paragraph with different content.\n"
            "\n"
            "Third paragraph to test chunking behavior.\n"
        )

        text_chunker = TextChunker(chunk_size=100, chunk_overlap=10, settings=settings)

        pieces = text_chunker.chunk(content=body, source_path="/test/text.txt")

        assert len(pieces) >= 1

    def test_handles_single_paragraph(self, settings):
        """A short paragraph comes back as one chunk with unchanged content."""
        from chunking.text import TextChunker

        paragraph = "This is a short paragraph."

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        pieces = text_chunker.chunk(content=paragraph, source_path="/test/short.txt")

        assert len(pieces) == 1
        assert pieces[0].content == paragraph

    def test_chunk_type_is_text(self, settings):
        """The chunker's ``chunk_type`` property equals ChunkType.TEXT."""
        from chunking.text import TextChunker
        from models import ChunkType

        text_chunker = TextChunker(chunk_size=400, chunk_overlap=50, settings=settings)

        assert text_chunker.chunk_type == ChunkType.TEXT


class TestChunkerFactory:
    """Tests for ChunkerFactory lookup, dispatch and caching."""

    def test_get_code_chunker(self, settings):
        """FileType.PYTHON resolves to a CodeChunker."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(file_type=FileType.PYTHON)

        assert isinstance(resolved, CodeChunker)

    def test_get_markdown_chunker(self, settings):
        """FileType.MARKDOWN resolves to a MarkdownChunker."""
        from chunking.base import ChunkerFactory
        from chunking.markdown import MarkdownChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(file_type=FileType.MARKDOWN)

        assert isinstance(resolved, MarkdownChunker)

    def test_get_text_chunker(self, settings):
        """FileType.TEXT resolves to a TextChunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker
        from models import FileType

        resolved = ChunkerFactory(settings=settings).get_chunker(file_type=FileType.TEXT)

        assert isinstance(resolved, TextChunker)

    def test_get_chunker_for_path(self, settings):
        """File paths map to the expected (chunker, file type) pair."""
        from chunking.base import ChunkerFactory
        from chunking.code import CodeChunker
        from chunking.markdown import MarkdownChunker
        from models import FileType

        factory = ChunkerFactory(settings=settings)

        # (path, expected chunker class, expected detected file type)
        expectations = (
            ("/test/file.py", CodeChunker, FileType.PYTHON),
            ("/test/docs.md", MarkdownChunker, FileType.MARKDOWN),
        )
        for path, chunker_cls, expected_type in expectations:
            resolved, detected_type = factory.get_chunker_for_path(path)
            assert isinstance(resolved, chunker_cls)
            assert detected_type == expected_type

    def test_chunk_content(self, settings, sample_python_code):
        """``chunk_content`` on a ``.py`` path yields only CODE chunks."""
        from chunking.base import ChunkerFactory
        from models import ChunkType

        factory = ChunkerFactory(settings=settings)

        pieces = factory.chunk_content(
            content=sample_python_code,
            source_path="/test/sample.py",
        )

        assert len(pieces) >= 1
        for piece in pieces:
            assert piece.chunk_type == ChunkType.CODE

    def test_default_to_text_chunker(self, settings):
        """With no arguments, ``get_chunker`` returns a TextChunker."""
        from chunking.base import ChunkerFactory
        from chunking.text import TextChunker

        resolved = ChunkerFactory(settings=settings).get_chunker()

        assert isinstance(resolved, TextChunker)

    def test_chunker_caching(self, settings):
        """Two requests for the same file type return the same instance."""
        from chunking.base import ChunkerFactory
        from models import FileType

        factory = ChunkerFactory(settings=settings)

        first = factory.get_chunker(file_type=FileType.PYTHON)
        second = factory.get_chunker(file_type=FileType.PYTHON)

        assert first is second


class TestGlobalChunkerFactory:
    """Tests for the global chunker factory accessors.

    Both tests touch process-wide state through ``get_chunker_factory`` and
    ``reset_chunker_factory``. Each one resets the global factory on exit so
    the instance it created does not leak into later tests.
    """

    def test_get_chunker_factory_singleton(self):
        """Test that get_chunker_factory returns the same object on repeat calls."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        # Start from a clean slate so the result does not depend on
        # whatever earlier tests left behind.
        reset_chunker_factory()
        try:
            factory1 = get_chunker_factory()
            factory2 = get_chunker_factory()

            assert factory1 is factory2
        finally:
            # Do not leave this test's factory installed globally.
            reset_chunker_factory()

    def test_reset_chunker_factory(self):
        """Test that a reset makes get_chunker_factory build a new instance."""
        from chunking.base import get_chunker_factory, reset_chunker_factory

        try:
            factory1 = get_chunker_factory()
            reset_chunker_factory()
            factory2 = get_chunker_factory()

            assert factory1 is not factory2
        finally:
            # factory2 is now the global instance; clear it again so that
            # later tests are not affected.
            reset_chunker_factory()