"""
|
|
Plain text chunking implementation.
|
|
|
|
Provides simple text chunking with paragraph and sentence
|
|
boundary detection.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any
|
|
|
|
from chunking.base import BaseChunker
|
|
from config import Settings
|
|
from models import Chunk, ChunkType, FileType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TextChunker(BaseChunker):
|
|
"""
|
|
Plain text chunker with paragraph awareness.
|
|
|
|
Features:
|
|
- Splits on paragraph boundaries
|
|
- Falls back to sentence/word boundaries
|
|
- Configurable overlap for context preservation
|
|
"""

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize text chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.TEXT

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk plain text content.

        Tries paragraph boundaries first, then sentences.
        """
        if not content.strip():
            return []

        metadata = metadata or {}

        # Check if content fits in a single chunk
        total_tokens = self.count_tokens(content)
        if total_tokens <= self.chunk_size:
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            ]

        # Try paragraph-based chunking
        paragraphs = self._split_paragraphs(content)
        if len(paragraphs) > 1:
            return self._chunk_by_paragraphs(
                paragraphs, source_path, file_type, metadata
            )

        # Fall back to sentence-based chunking
        return self._chunk_by_sentences(content, source_path, file_type, metadata)
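
    # Dispatch summary for chunk() above: content that already fits in chunk_size
    # is returned as a single chunk; multi-paragraph content goes through
    # _chunk_by_paragraphs; a single oversized paragraph falls back to
    # _chunk_by_sentences.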

    def _split_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs."""
        paragraphs: list[dict[str, Any]] = []

        # Split on double newlines (paragraph boundaries)
        raw_paras = re.split(r"\n\s*\n", content)

        line_num = 1
        for para in raw_paras:
            para = para.strip()
            if not para:
                continue

            para_lines = para.count("\n") + 1
            paragraphs.append(
                {
                    "content": para,
                    "tokens": self.count_tokens(para),
                    "start_line": line_num,
                    "end_line": line_num + para_lines - 1,
                }
            )
            line_num += para_lines + 1  # +1 for blank line between paragraphs

        return paragraphs
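
    # Shape of the result above, shown on a small example (token counts depend on
    # BaseChunker.count_tokens, so the numbers are left as placeholders):
    #
    #     "First para.\n\nSecond\npara." ->
    #     [{"content": "First para.", "tokens": ..., "start_line": 1, "end_line": 1},
    #      {"content": "Second\npara.", "tokens": ..., "start_line": 3, "end_line": 4}]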

    def _chunk_by_paragraphs(
        self,
        paragraphs: list[dict[str, Any]],
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by combining paragraphs up to size limit."""
        chunks: list[Chunk] = []
        current_paras: list[str] = []
        current_tokens = 0
        chunk_start = paragraphs[0]["start_line"] if paragraphs else 1
        chunk_end = chunk_start

        for para in paragraphs:
            para_content = para["content"]
            para_tokens = para["tokens"]

            # Handle paragraphs larger than chunk size
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_paras:
                    chunk_text = "\n\n".join(current_paras)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=chunk_start,
                            end_line=chunk_end,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_paras = []
                    current_tokens = 0

                # Split large paragraph
                sub_chunks = self._split_large_text(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    para["start_line"],
                )
                chunks.extend(sub_chunks)
                chunk_start = para["end_line"] + 1
                chunk_end = chunk_start
                continue

            # Check if adding paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_paras:
                chunk_text = "\n\n".join(current_paras)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=chunk_start,
                        end_line=chunk_end,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last paragraph if small enough
                overlap_para = None
                if (
                    current_paras
                    and self.count_tokens(current_paras[-1]) <= self.chunk_overlap
                ):
                    overlap_para = current_paras[-1]

                current_paras = [overlap_para] if overlap_para else []
                current_tokens = self.count_tokens(overlap_para) if overlap_para else 0
                chunk_start = para["start_line"]

            current_paras.append(para_content)
            current_tokens += para_tokens
            chunk_end = para["end_line"]

        # Final chunk
        if current_paras:
            chunk_text = "\n\n".join(current_paras)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=chunk_start,
                    end_line=chunk_end,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
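
    # Worked example of the overlap rule above (token counts are hypothetical):
    # with chunk_size=10 and chunk_overlap=4, paragraphs of 6, 5, and 3 tokens
    # first flush a [p1] chunk; p1 (6 tokens) exceeds the 4-token overlap budget,
    # so it is not carried over, and the second chunk becomes p2 + p3 (8 tokens).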

    def _chunk_by_sentences(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by sentences."""
        sentences = self._split_sentences(content)

        if not sentences:
            return []

        chunks: list[Chunk] = []
        current_sentences: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # Handle sentences larger than chunk size
            if sentence_tokens > self.chunk_size:
                if current_sentences:
                    chunk_text = " ".join(current_sentences)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=1,
                            end_line=1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_sentences = []
                    current_tokens = 0

                # Truncate large sentence
                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            # Check if adding sentence exceeds limit
            if current_tokens + sentence_tokens > self.chunk_size and current_sentences:
                chunk_text = " ".join(current_sentences)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last sentence if small enough
                overlap = None
                if (
                    current_sentences
                    and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap
                ):
                    overlap = current_sentences[-1]

                current_sentences = [overlap] if overlap else []
                current_tokens = self.count_tokens(overlap) if overlap else 0

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_sentences:
            chunk_text = " ".join(current_sentences)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Handle common sentence endings
        # This is a simple approach - production might use nltk or spacy
        sentence_pattern = re.compile(
            r"(?<=[.!?])\s+(?=[A-Z])|"  # Standard sentence ending
            r"(?<=[.!?])\s*$|"  # End of text
            r"(?<=\n)\s*(?=\S)"  # Newlines as boundaries
        )

        sentences = sentence_pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]
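
    # Example split from the pattern above:
    #     "It works. Mostly fine!\nSee notes." ->
    #     ["It works.", "Mostly fine!", "See notes."]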

    def _split_large_text(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split text that exceeds chunk size."""
        # First try sentences
        sentences = self._split_sentences(text)

        if len(sentences) > 1:
            return self._chunk_by_sentences(text, source_path, file_type, metadata)

        # Fall back to word-based splitting
        return self._chunk_by_words(text, source_path, file_type, metadata, base_line)

    def _chunk_by_words(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Last resort: chunk by words."""
        words = text.split()
        chunks: list[Chunk] = []
        current_words: list[str] = []
        current_tokens = 0

        for word in words:
            word_tokens = self.count_tokens(word + " ")

            if current_tokens + word_tokens > self.chunk_size and current_words:
                chunk_text = " ".join(current_words)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Word overlap
                overlap_count = 0
                overlap_words: list[str] = []
                for w in reversed(current_words):
                    w_tokens = self.count_tokens(w + " ")
                    if overlap_count + w_tokens > self.chunk_overlap:
                        break
                    overlap_words.insert(0, w)
                    overlap_count += w_tokens

                current_words = overlap_words
                current_tokens = overlap_count

            current_words.append(word)
            current_tokens += word_tokens

        # Final chunk
        if current_words:
            chunk_text = " ".join(current_words)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
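

# Illustrative smoke test only -- not part of the module's public surface. The
# sizes here are arbitrary, and Settings defaults from config are assumed to
# apply when `settings` is omitted.
if __name__ == "__main__":
    sample = "First paragraph of text.\n\nSecond paragraph, a bit longer.\n\nThird one."
    chunker = TextChunker(chunk_size=32, chunk_overlap=8)
    for piece in chunker.chunk(sample, source_path="sample.txt"):
        print(piece)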