""" Plain text chunking implementation. Provides simple text chunking with paragraph and sentence boundary detection. """ import logging import re from typing import Any from chunking.base import BaseChunker from config import Settings from models import Chunk, ChunkType, FileType logger = logging.getLogger(__name__) class TextChunker(BaseChunker): """ Plain text chunker with paragraph awareness. Features: - Splits on paragraph boundaries - Falls back to sentence/word boundaries - Configurable overlap for context preservation """ def __init__( self, chunk_size: int, chunk_overlap: int, settings: Settings | None = None, ) -> None: """Initialize text chunker.""" super().__init__(chunk_size, chunk_overlap, settings) @property def chunk_type(self) -> ChunkType: """Get chunk type.""" return ChunkType.TEXT def chunk( self, content: str, source_path: str | None = None, file_type: FileType | None = None, metadata: dict[str, Any] | None = None, ) -> list[Chunk]: """ Chunk plain text content. Tries paragraph boundaries first, then sentences. """ if not content.strip(): return [] metadata = metadata or {} # Check if content fits in a single chunk total_tokens = self.count_tokens(content) if total_tokens <= self.chunk_size: return [ self._create_chunk( content=content.strip(), source_path=source_path, start_line=1, end_line=content.count("\n") + 1, file_type=file_type, metadata=metadata, ) ] # Try paragraph-based chunking paragraphs = self._split_paragraphs(content) if len(paragraphs) > 1: return self._chunk_by_paragraphs( paragraphs, source_path, file_type, metadata ) # Fall back to sentence-based chunking return self._chunk_by_sentences(content, source_path, file_type, metadata) def _split_paragraphs(self, content: str) -> list[dict[str, Any]]: """Split content into paragraphs.""" paragraphs: list[dict[str, Any]] = [] # Split on double newlines (paragraph boundaries) raw_paras = re.split(r"\n\s*\n", content) line_num = 1 for para in raw_paras: para = para.strip() if not para: continue para_lines = para.count("\n") + 1 paragraphs.append( { "content": para, "tokens": self.count_tokens(para), "start_line": line_num, "end_line": line_num + para_lines - 1, } ) line_num += para_lines + 1 # +1 for blank line between paragraphs return paragraphs def _chunk_by_paragraphs( self, paragraphs: list[dict[str, Any]], source_path: str | None, file_type: FileType | None, metadata: dict[str, Any], ) -> list[Chunk]: """Chunk by combining paragraphs up to size limit.""" chunks: list[Chunk] = [] current_paras: list[str] = [] current_tokens = 0 chunk_start = paragraphs[0]["start_line"] if paragraphs else 1 chunk_end = chunk_start for para in paragraphs: para_content = para["content"] para_tokens = para["tokens"] # Handle paragraphs larger than chunk size if para_tokens > self.chunk_size: # Flush current content if current_paras: chunk_text = "\n\n".join(current_paras) chunks.append( self._create_chunk( content=chunk_text, source_path=source_path, start_line=chunk_start, end_line=chunk_end, file_type=file_type, metadata=metadata, ) ) current_paras = [] current_tokens = 0 # Split large paragraph sub_chunks = self._split_large_text( para_content, source_path, file_type, metadata, para["start_line"], ) chunks.extend(sub_chunks) chunk_start = para["end_line"] + 1 chunk_end = chunk_start continue # Check if adding paragraph exceeds limit if current_tokens + para_tokens > self.chunk_size and current_paras: chunk_text = "\n\n".join(current_paras) chunks.append( self._create_chunk( content=chunk_text, source_path=source_path, 
                        start_line=chunk_start,
                        end_line=chunk_end,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last paragraph if small enough
                overlap_para = None
                if (
                    current_paras
                    and self.count_tokens(current_paras[-1]) <= self.chunk_overlap
                ):
                    overlap_para = current_paras[-1]

                current_paras = [overlap_para] if overlap_para else []
                current_tokens = self.count_tokens(overlap_para) if overlap_para else 0
                chunk_start = para["start_line"]

            current_paras.append(para_content)
            current_tokens += para_tokens
            chunk_end = para["end_line"]

        # Final chunk
        if current_paras:
            chunk_text = "\n\n".join(current_paras)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=chunk_start,
                    end_line=chunk_end,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _chunk_by_sentences(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by sentences."""
        sentences = self._split_sentences(content)
        if not sentences:
            return []

        chunks: list[Chunk] = []
        current_sentences: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # Handle sentences larger than chunk size
            if sentence_tokens > self.chunk_size:
                if current_sentences:
                    chunk_text = " ".join(current_sentences)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=1,
                            end_line=1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_sentences = []
                    current_tokens = 0

                # Truncate large sentence
                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            # Check if adding sentence exceeds limit
            if current_tokens + sentence_tokens > self.chunk_size and current_sentences:
                chunk_text = " ".join(current_sentences)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last sentence if small enough
                overlap = None
                if (
                    current_sentences
                    and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap
                ):
                    overlap = current_sentences[-1]

                current_sentences = [overlap] if overlap else []
                current_tokens = self.count_tokens(overlap) if overlap else 0

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_sentences:
            chunk_text = " ".join(current_sentences)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Handle common sentence endings
        # This is a simple approach - production might use nltk or spacy
        sentence_pattern = re.compile(
            r"(?<=[.!?])\s+(?=[A-Z])|"  # Standard sentence ending
            r"(?<=[.!?])\s*$|"  # End of text
            r"(?<=\n)\s*(?=\S)"  # Newlines as boundaries
        )
        sentences = sentence_pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]

    def _split_large_text(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split text that exceeds chunk size."""
        # First try sentences
        sentences = self._split_sentences(text)
        if len(sentences) > 1:
            return self._chunk_by_sentences(text, source_path, file_type, metadata)

        # Fall back to word-based splitting
        return self._chunk_by_words(
            text, source_path, file_type, metadata, base_line
        )

    def _chunk_by_words(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Last resort: chunk by words."""
        words = text.split()
        chunks: list[Chunk] = []
        current_words: list[str] = []
        current_tokens = 0

        for word in words:
            word_tokens = self.count_tokens(word + " ")

            if current_tokens + word_tokens > self.chunk_size and current_words:
                chunk_text = " ".join(current_words)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Word overlap
                overlap_count = 0
                overlap_words: list[str] = []
                for w in reversed(current_words):
                    w_tokens = self.count_tokens(w + " ")
                    if overlap_count + w_tokens > self.chunk_overlap:
                        break
                    overlap_words.insert(0, w)
                    overlap_count += w_tokens

                current_words = overlap_words
                current_tokens = overlap_count

            current_words.append(word)
            current_tokens += word_tokens

        # Final chunk
        if current_words:
            chunk_text = " ".join(current_words)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
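

# Usage sketch (illustrative only, not part of the chunker itself): a minimal
# example of driving TextChunker, assuming the constructor above accepts the
# default settings=None and that Chunk exposes `content`, `start_line`, and
# `end_line` fields matching the keyword arguments passed to _create_chunk in
# this module. Adjust names to the real BaseChunker/Chunk models if they differ.
if __name__ == "__main__":
    sample = (
        "First paragraph. It has two short sentences.\n"
        "\n"
        "Second paragraph follows after a blank line and may be split further."
    )
    # chunk_size and chunk_overlap are token budgets, not character counts.
    chunker = TextChunker(chunk_size=64, chunk_overlap=16)
    for c in chunker.chunk(sample, source_path="example.txt"):
        print(f"lines {c.start_line}-{c.end_line}: {c.content[:40]!r}")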