"""
|
|
Base chunker implementation.
|
|
|
|
Provides abstract interface and common utilities for content chunking.
|
|
"""
|
|
|
|
import logging
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any
|
|
|
|
import tiktoken
|
|
|
|
from config import Settings, get_settings
|
|
from exceptions import ChunkingError
|
|
from models import FILE_EXTENSION_MAP, Chunk, ChunkType, FileType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class BaseChunker(ABC):
    """
    Abstract base class for content chunkers.

    Subclasses implement specific chunking strategies for
    different content types (code, markdown, text).
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """
        Initialize chunker.

        Args:
            chunk_size: Target tokens per chunk
            chunk_overlap: Token overlap between chunks
            settings: Application settings
        """
        self._settings = settings or get_settings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Use cl100k_base encoding (GPT-4/text-embedding-3)
        self._tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self._tokenizer.encode(text))

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Truncate text to max tokens."""
        tokens = self._tokenizer.encode(text)
        if len(tokens) <= max_tokens:
            return text
        return self._tokenizer.decode(tokens[:max_tokens])
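
    # Token utilities at a glance (illustrative; exact counts depend on the
    # cl100k_base vocabulary):
    #     self.count_tokens("hello world")    # -> 2 tokens
    #     self.truncate_to_tokens(text, 128)  # decoded prefix of at most 128 tokens
    # Note that truncation happens on token boundaries, not characters, so the
    # cut point may land mid-word.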

    @abstractmethod
    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Split content into chunks.

        Args:
            content: Content to chunk
            source_path: Source file path for reference
            file_type: File type for specialized handling
            metadata: Additional metadata to include

        Returns:
            List of Chunk objects
        """

    @property
    @abstractmethod
    def chunk_type(self) -> ChunkType:
        """Get the chunk type this chunker produces."""

    def _create_chunk(
        self,
        content: str,
        source_path: str | None = None,
        start_line: int | None = None,
        end_line: int | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Chunk:
        """Create a chunk with token count."""
        token_count = self.count_tokens(content)
        return Chunk(
            content=content,
            chunk_type=self.chunk_type,
            file_type=file_type,
            source_path=source_path,
            start_line=start_line,
            end_line=end_line,
            metadata=metadata or {},
            token_count=token_count,
        )
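

# A minimal concrete subclass, shown only as a sketch (the real strategies
# live in chunking.code, chunking.markdown, and chunking.text; this assumes
# ChunkType has a TEXT member):
#
#     class SentenceChunker(BaseChunker):
#         """Hypothetical chunker that splits on sentence boundaries."""
#
#         @property
#         def chunk_type(self) -> ChunkType:
#             return ChunkType.TEXT
#
#         def chunk(self, content, source_path=None, file_type=None, metadata=None):
#             sentences = [s.strip() for s in content.split(". ") if s.strip()]
#             return [
#                 self._create_chunk(s, source_path=source_path, metadata=metadata)
#                 for s in sentences
#             ]
#
# The sketch ignores chunk_size/chunk_overlap; real subclasses pack units
# (sentences, sections, AST nodes) up to the token budget instead.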


class ChunkerFactory:
    """
    Factory for creating appropriate chunkers.

    Selects the best chunker based on file type or content.
    """

    def __init__(self, settings: Settings | None = None) -> None:
        """Initialize factory."""
        self._settings = settings or get_settings()
        self._chunkers: dict[str, BaseChunker] = {}

    def _get_code_chunker(self) -> "BaseChunker":
        """Get or create code chunker."""
        from chunking.code import CodeChunker

        if "code" not in self._chunkers:
            self._chunkers["code"] = CodeChunker(
                chunk_size=self._settings.code_chunk_size,
                chunk_overlap=self._settings.code_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["code"]

    def _get_markdown_chunker(self) -> "BaseChunker":
        """Get or create markdown chunker."""
        from chunking.markdown import MarkdownChunker

        if "markdown" not in self._chunkers:
            self._chunkers["markdown"] = MarkdownChunker(
                chunk_size=self._settings.markdown_chunk_size,
                chunk_overlap=self._settings.markdown_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["markdown"]

    def _get_text_chunker(self) -> "BaseChunker":
        """Get or create text chunker."""
        from chunking.text import TextChunker

        if "text" not in self._chunkers:
            self._chunkers["text"] = TextChunker(
                chunk_size=self._settings.text_chunk_size,
                chunk_overlap=self._settings.text_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["text"]

    def get_chunker(
        self,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
    ) -> BaseChunker:
        """
        Get appropriate chunker for content type.

        Args:
            file_type: File type to chunk
            chunk_type: Explicit chunk type to use

        Returns:
            Appropriate chunker instance
        """
        # If explicit chunk type specified, use it
        if chunk_type:
            if chunk_type == ChunkType.CODE:
                return self._get_code_chunker()
            elif chunk_type == ChunkType.MARKDOWN:
                return self._get_markdown_chunker()
            else:
                return self._get_text_chunker()

        # Otherwise, infer from file type
        if file_type:
            if file_type == FileType.MARKDOWN:
                return self._get_markdown_chunker()
            elif file_type in (
                FileType.TEXT,
                FileType.JSON,
                FileType.YAML,
                FileType.TOML,
            ):
                return self._get_text_chunker()
            else:
                # Code files
                return self._get_code_chunker()

        # Default to text chunker
        return self._get_text_chunker()
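
    # Dispatch at a glance (illustrative):
    #     get_chunker(chunk_type=ChunkType.CODE)    -> code chunker
    #     get_chunker(file_type=FileType.MARKDOWN)  -> markdown chunker
    #     get_chunker()                             -> text chunker (default)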

    def get_chunker_for_path(
        self, source_path: str
    ) -> tuple[BaseChunker, FileType | None]:
        """
        Get chunker based on file path extension.

        Args:
            source_path: File path to chunk

        Returns:
            Tuple of (chunker, file_type)
        """
        # Extract extension
        ext = ""
        if "." in source_path:
            ext = "." + source_path.rsplit(".", 1)[-1].lower()

        file_type = FILE_EXTENSION_MAP.get(ext)
        chunker = self.get_chunker(file_type=file_type)

        return chunker, file_type
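
    # Illustrative, assuming the usual FILE_EXTENSION_MAP entries (e.g. a
    # FileType.PYTHON mapping for ".py"):
    #     get_chunker_for_path("src/app.py")  -> (code chunker, FileType.PYTHON)
    #     get_chunker_for_path("README.md")   -> (markdown chunker, FileType.MARKDOWN)
    #     get_chunker_for_path("LICENSE")     -> (text chunker, None)  # no extension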

    def chunk_content(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk content using appropriate strategy.

        Args:
            content: Content to chunk
            source_path: Source file path
            file_type: File type
            chunk_type: Explicit chunk type
            metadata: Additional metadata

        Returns:
            List of chunks
        """
        # If we have a source path but no file type, infer it from the path
        if source_path and not file_type:
            _, file_type = self.get_chunker_for_path(source_path)

        # An explicit chunk_type still takes precedence over the inferred type
        chunker = self.get_chunker(file_type=file_type, chunk_type=chunk_type)

        try:
            chunks = chunker.chunk(
                content=content,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )

            logger.debug(
                f"Chunked content into {len(chunks)} chunks "
                f"(type={chunker.chunk_type.value})"
            )

            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {e}")
            raise ChunkingError(
                message=f"Failed to chunk content: {e}",
                cause=e,
            ) from e


# Global chunker factory instance
_chunker_factory: ChunkerFactory | None = None


def get_chunker_factory() -> ChunkerFactory:
    """Get the global chunker factory instance."""
    global _chunker_factory
    if _chunker_factory is None:
        _chunker_factory = ChunkerFactory()
    return _chunker_factory


def reset_chunker_factory() -> None:
    """Reset the global chunker factory (for testing)."""
    global _chunker_factory
    _chunker_factory = None
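

if __name__ == "__main__":
    # Minimal usage sketch (illustrative): assumes the config/models modules
    # imported above are importable and tiktoken is installed.
    factory = get_chunker_factory()
    demo_chunks = factory.chunk_content(
        content="# Title\n\nSome body text.\n",
        source_path="README.md",
    )
    for demo_chunk in demo_chunks:
        print(demo_chunk.chunk_type, demo_chunk.token_count)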