"""
|
|
Base chunker implementation.
|
|
|
|
Provides abstract interface and common utilities for content chunking.
|
|
"""
|
|
|
|
import logging
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any
|
|
|
|
import tiktoken
|
|
|
|
from config import Settings, get_settings
|
|
from exceptions import ChunkingError
|
|
from models import FILE_EXTENSION_MAP, Chunk, ChunkType, FileType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class BaseChunker(ABC):
    """
    Abstract base class for content chunkers.

    Subclasses implement specific chunking strategies for
    different content types (code, markdown, text).
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """
        Initialize chunker.

        Args:
            chunk_size: Target tokens per chunk
            chunk_overlap: Token overlap between chunks
            settings: Application settings
        """
        self._settings = settings or get_settings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Use cl100k_base encoding (GPT-4/text-embedding-3)
        self._tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self._tokenizer.encode(text))

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Truncate text to max tokens."""
        tokens = self._tokenizer.encode(text)
        if len(tokens) <= max_tokens:
            return text
        return self._tokenizer.decode(tokens[:max_tokens])
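
    # Token utilities at a glance (illustrative; exact counts depend on the
    # cl100k_base vocabulary):
    #     self.count_tokens("hello world")    # -> 2 tokens
    #     self.truncate_to_tokens(text, 128)  # decoded prefix of at most 128 tokens
    # Note that truncation happens on token boundaries, not characters, so the
    # cut point may land mid-word.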

    @abstractmethod
    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Split content into chunks.

        Args:
            content: Content to chunk
            source_path: Source file path for reference
            file_type: File type for specialized handling
            metadata: Additional metadata to include

        Returns:
            List of Chunk objects
        """

    @property
    @abstractmethod
    def chunk_type(self) -> ChunkType:
        """Get the chunk type this chunker produces."""

    def _create_chunk(
        self,
        content: str,
        source_path: str | None = None,
        start_line: int | None = None,
        end_line: int | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Chunk:
        """Create a chunk with token count."""
        token_count = self.count_tokens(content)
        return Chunk(
            content=content,
            chunk_type=self.chunk_type,
            file_type=file_type,
            source_path=source_path,
            start_line=start_line,
            end_line=end_line,
            metadata=metadata or {},
            token_count=token_count,
        )
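

# A minimal concrete subclass, shown only as a sketch (the real strategies
# live in chunking.code, chunking.markdown, and chunking.text; this assumes
# ChunkType has a TEXT member):
#
#     class SentenceChunker(BaseChunker):
#         """Hypothetical chunker that splits on sentence boundaries."""
#
#         @property
#         def chunk_type(self) -> ChunkType:
#             return ChunkType.TEXT
#
#         def chunk(self, content, source_path=None, file_type=None, metadata=None):
#             sentences = [s.strip() for s in content.split(". ") if s.strip()]
#             return [
#                 self._create_chunk(s, source_path=source_path, metadata=metadata)
#                 for s in sentences
#             ]
#
# The sketch ignores chunk_size/chunk_overlap; real subclasses pack units
# (sentences, sections, AST nodes) up to the token budget instead.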


class ChunkerFactory:
    """
    Factory for creating appropriate chunkers.

    Selects the best chunker based on file type or content.
    """

    def __init__(self, settings: Settings | None = None) -> None:
        """Initialize factory."""
        self._settings = settings or get_settings()
        self._chunkers: dict[str, BaseChunker] = {}

    def _get_code_chunker(self) -> "BaseChunker":
        """Get or create code chunker."""
        from chunking.code import CodeChunker

        if "code" not in self._chunkers:
            self._chunkers["code"] = CodeChunker(
                chunk_size=self._settings.code_chunk_size,
                chunk_overlap=self._settings.code_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["code"]

    def _get_markdown_chunker(self) -> "BaseChunker":
        """Get or create markdown chunker."""
        from chunking.markdown import MarkdownChunker

        if "markdown" not in self._chunkers:
            self._chunkers["markdown"] = MarkdownChunker(
                chunk_size=self._settings.markdown_chunk_size,
                chunk_overlap=self._settings.markdown_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["markdown"]

    def _get_text_chunker(self) -> "BaseChunker":
        """Get or create text chunker."""
        from chunking.text import TextChunker

        if "text" not in self._chunkers:
            self._chunkers["text"] = TextChunker(
                chunk_size=self._settings.text_chunk_size,
                chunk_overlap=self._settings.text_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["text"]

    def get_chunker(
        self,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
    ) -> BaseChunker:
        """
        Get appropriate chunker for content type.

        Args:
            file_type: File type to chunk
            chunk_type: Explicit chunk type to use

        Returns:
            Appropriate chunker instance
        """
        # If explicit chunk type specified, use it
        if chunk_type:
            if chunk_type == ChunkType.CODE:
                return self._get_code_chunker()
            elif chunk_type == ChunkType.MARKDOWN:
                return self._get_markdown_chunker()
            else:
                return self._get_text_chunker()

        # Otherwise, infer from file type
        if file_type:
            if file_type == FileType.MARKDOWN:
                return self._get_markdown_chunker()
            elif file_type in (
                FileType.TEXT,
                FileType.JSON,
                FileType.YAML,
                FileType.TOML,
            ):
                return self._get_text_chunker()
            else:
                # Code files
                return self._get_code_chunker()

        # Default to text chunker
        return self._get_text_chunker()
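
    # Dispatch at a glance (illustrative):
    #     get_chunker(chunk_type=ChunkType.CODE)    -> code chunker
    #     get_chunker(file_type=FileType.MARKDOWN)  -> markdown chunker
    #     get_chunker()                             -> text chunker (default)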

    def get_chunker_for_path(
        self, source_path: str
    ) -> tuple[BaseChunker, FileType | None]:
        """
        Get chunker based on file path extension.

        Args:
            source_path: File path to chunk

        Returns:
            Tuple of (chunker, file_type)
        """
        # Extract extension
        ext = ""
        if "." in source_path:
            ext = "." + source_path.rsplit(".", 1)[-1].lower()

        file_type = FILE_EXTENSION_MAP.get(ext)
        chunker = self.get_chunker(file_type=file_type)

        return chunker, file_type
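
    # Illustrative, assuming the usual FILE_EXTENSION_MAP entries (e.g. a
    # FileType.PYTHON mapping for ".py"):
    #     get_chunker_for_path("src/app.py")  -> (code chunker, FileType.PYTHON)
    #     get_chunker_for_path("README.md")   -> (markdown chunker, FileType.MARKDOWN)
    #     get_chunker_for_path("LICENSE")     -> (text chunker, None)  # no extension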

    def chunk_content(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk content using appropriate strategy.

        Args:
            content: Content to chunk
            source_path: Source file path
            file_type: File type
            chunk_type: Explicit chunk type
            metadata: Additional metadata

        Returns:
            List of chunks
        """
        # If we have a source path but no file type, infer it from the path
        if source_path and not file_type:
            _, file_type = self.get_chunker_for_path(source_path)

        # An explicit chunk_type still takes precedence over the inferred type
        chunker = self.get_chunker(file_type=file_type, chunk_type=chunk_type)

        try:
            chunks = chunker.chunk(
                content=content,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )

            logger.debug(
                f"Chunked content into {len(chunks)} chunks "
                f"(type={chunker.chunk_type.value})"
            )

            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {e}")
            raise ChunkingError(
                message=f"Failed to chunk content: {e}",
                cause=e,
            ) from e


# Global chunker factory instance
_chunker_factory: ChunkerFactory | None = None


def get_chunker_factory() -> ChunkerFactory:
    """Get the global chunker factory instance."""
    global _chunker_factory
    if _chunker_factory is None:
        _chunker_factory = ChunkerFactory()
    return _chunker_factory


def reset_chunker_factory() -> None:
    """Reset the global chunker factory (for testing)."""
    global _chunker_factory
    _chunker_factory = None
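

if __name__ == "__main__":
    # Minimal usage sketch (illustrative): assumes the config/models modules
    # imported above are importable and tiktoken is installed.
    factory = get_chunker_factory()
    demo_chunks = factory.chunk_content(
        content="# Title\n\nSome body text.\n",
        source_path="README.md",
    )
    for demo_chunk in demo_chunks:
        print(demo_chunk.chunk_type, demo_chunk.token_count)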