""" Base chunker implementation. Provides abstract interface and common utilities for content chunking. """ import logging from abc import ABC, abstractmethod from typing import Any import tiktoken from config import Settings, get_settings from exceptions import ChunkingError from models import FILE_EXTENSION_MAP, Chunk, ChunkType, FileType logger = logging.getLogger(__name__) class BaseChunker(ABC): """ Abstract base class for content chunkers. Subclasses implement specific chunking strategies for different content types (code, markdown, text). """ def __init__( self, chunk_size: int, chunk_overlap: int, settings: Settings | None = None, ) -> None: """ Initialize chunker. Args: chunk_size: Target tokens per chunk chunk_overlap: Token overlap between chunks settings: Application settings """ self._settings = settings or get_settings() self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap # Use cl100k_base encoding (GPT-4/text-embedding-3) self._tokenizer = tiktoken.get_encoding("cl100k_base") def count_tokens(self, text: str) -> int: """Count tokens in text.""" return len(self._tokenizer.encode(text)) def truncate_to_tokens(self, text: str, max_tokens: int) -> str: """Truncate text to max tokens.""" tokens = self._tokenizer.encode(text) if len(tokens) <= max_tokens: return text return self._tokenizer.decode(tokens[:max_tokens]) @abstractmethod def chunk( self, content: str, source_path: str | None = None, file_type: FileType | None = None, metadata: dict[str, Any] | None = None, ) -> list[Chunk]: """ Split content into chunks. Args: content: Content to chunk source_path: Source file path for reference file_type: File type for specialized handling metadata: Additional metadata to include Returns: List of Chunk objects """ pass @property @abstractmethod def chunk_type(self) -> ChunkType: """Get the chunk type this chunker produces.""" pass def _create_chunk( self, content: str, source_path: str | None = None, start_line: int | None = None, end_line: int | None = None, file_type: FileType | None = None, metadata: dict[str, Any] | None = None, ) -> Chunk: """Create a chunk with token count.""" token_count = self.count_tokens(content) return Chunk( content=content, chunk_type=self.chunk_type, file_type=file_type, source_path=source_path, start_line=start_line, end_line=end_line, metadata=metadata or {}, token_count=token_count, ) class ChunkerFactory: """ Factory for creating appropriate chunkers. Selects the best chunker based on file type or content. """ def __init__(self, settings: Settings | None = None) -> None: """Initialize factory.""" self._settings = settings or get_settings() self._chunkers: dict[str, BaseChunker] = {} def _get_code_chunker(self) -> "BaseChunker": """Get or create code chunker.""" from chunking.code import CodeChunker if "code" not in self._chunkers: self._chunkers["code"] = CodeChunker( chunk_size=self._settings.code_chunk_size, chunk_overlap=self._settings.code_chunk_overlap, settings=self._settings, ) return self._chunkers["code"] def _get_markdown_chunker(self) -> "BaseChunker": """Get or create markdown chunker.""" from chunking.markdown import MarkdownChunker if "markdown" not in self._chunkers: self._chunkers["markdown"] = MarkdownChunker( chunk_size=self._settings.markdown_chunk_size, chunk_overlap=self._settings.markdown_chunk_overlap, settings=self._settings, ) return self._chunkers["markdown"] def _get_text_chunker(self) -> "BaseChunker": """Get or create text chunker.""" from chunking.text import TextChunker if "text" not in self._chunkers: self._chunkers["text"] = TextChunker( chunk_size=self._settings.text_chunk_size, chunk_overlap=self._settings.text_chunk_overlap, settings=self._settings, ) return self._chunkers["text"] def get_chunker( self, file_type: FileType | None = None, chunk_type: ChunkType | None = None, ) -> BaseChunker: """ Get appropriate chunker for content type. Args: file_type: File type to chunk chunk_type: Explicit chunk type to use Returns: Appropriate chunker instance """ # If explicit chunk type specified, use it if chunk_type: if chunk_type == ChunkType.CODE: return self._get_code_chunker() elif chunk_type == ChunkType.MARKDOWN: return self._get_markdown_chunker() else: return self._get_text_chunker() # Otherwise, infer from file type if file_type: if file_type == FileType.MARKDOWN: return self._get_markdown_chunker() elif file_type in ( FileType.TEXT, FileType.JSON, FileType.YAML, FileType.TOML, ): return self._get_text_chunker() else: # Code files return self._get_code_chunker() # Default to text chunker return self._get_text_chunker() def get_chunker_for_path( self, source_path: str ) -> tuple[BaseChunker, FileType | None]: """ Get chunker based on file path extension. Args: source_path: File path to chunk Returns: Tuple of (chunker, file_type) """ # Extract extension ext = "" if "." in source_path: ext = "." + source_path.rsplit(".", 1)[-1].lower() file_type = FILE_EXTENSION_MAP.get(ext) chunker = self.get_chunker(file_type=file_type) return chunker, file_type def chunk_content( self, content: str, source_path: str | None = None, file_type: FileType | None = None, chunk_type: ChunkType | None = None, metadata: dict[str, Any] | None = None, ) -> list[Chunk]: """ Chunk content using appropriate strategy. Args: content: Content to chunk source_path: Source file path file_type: File type chunk_type: Explicit chunk type metadata: Additional metadata Returns: List of chunks """ # If we have a source path but no file type, infer it if source_path and not file_type: chunker, file_type = self.get_chunker_for_path(source_path) else: chunker = self.get_chunker(file_type=file_type, chunk_type=chunk_type) try: chunks = chunker.chunk( content=content, source_path=source_path, file_type=file_type, metadata=metadata, ) logger.debug( f"Chunked content into {len(chunks)} chunks " f"(type={chunker.chunk_type.value})" ) return chunks except Exception as e: logger.error(f"Chunking error: {e}") raise ChunkingError( message=f"Failed to chunk content: {e}", cause=e, ) # Global chunker factory instance _chunker_factory: ChunkerFactory | None = None def get_chunker_factory() -> ChunkerFactory: """Get the global chunker factory instance.""" global _chunker_factory if _chunker_factory is None: _chunker_factory = ChunkerFactory() return _chunker_factory def reset_chunker_factory() -> None: """Reset the global chunker factory (for testing).""" global _chunker_factory _chunker_factory = None