syndarix/mcp-servers/knowledge-base/chunking/text.py

"""
Plain text chunking implementation.
Provides simple text chunking with paragraph and sentence
boundary detection.
"""
import logging
import re
from typing import Any
from chunking.base import BaseChunker
from config import Settings
from models import Chunk, ChunkType, FileType
logger = logging.getLogger(__name__)
class TextChunker(BaseChunker):
"""
Plain text chunker with paragraph awareness.
Features:
- Splits on paragraph boundaries
- Falls back to sentence/word boundaries
- Configurable overlap for context preservation
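Example (illustrative sketch; the size and overlap values below are
arbitrary placeholders, not project defaults):
    chunker = TextChunker(chunk_size=512, chunk_overlap=64)
    chunks = chunker.chunk(text, source_path="notes.txt")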
"""
def __init__(
self,
chunk_size: int,
chunk_overlap: int,
settings: Settings | None = None,
) -> None:
"""Initialize text chunker."""
super().__init__(chunk_size, chunk_overlap, settings)
@property
def chunk_type(self) -> ChunkType:
"""Get chunk type."""
return ChunkType.TEXT
def chunk(
self,
content: str,
source_path: str | None = None,
file_type: FileType | None = None,
metadata: dict[str, Any] | None = None,
) -> list[Chunk]:
"""
Chunk plain text content.
Tries paragraph boundaries first, then sentences.
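Returns an empty list for blank or whitespace-only input.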
"""
if not content.strip():
return []
metadata = metadata or {}
# Check if content fits in a single chunk
total_tokens = self.count_tokens(content)
if total_tokens <= self.chunk_size:
return [
self._create_chunk(
content=content.strip(),
source_path=source_path,
start_line=1,
end_line=content.count("\n") + 1,
file_type=file_type,
metadata=metadata,
)
]
# Try paragraph-based chunking
paragraphs = self._split_paragraphs(content)
if len(paragraphs) > 1:
return self._chunk_by_paragraphs(
paragraphs, source_path, file_type, metadata
)
# Fall back to sentence-based chunking
return self._chunk_by_sentences(content, source_path, file_type, metadata)
def _split_paragraphs(self, content: str) -> list[dict[str, Any]]:
"""Split content into paragraphs."""
paragraphs: list[dict[str, Any]] = []
# Split on double newlines (paragraph boundaries)
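# (a line containing only whitespace also counts as a paragraph break)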
raw_paras = re.split(r"\n\s*\n", content)
line_num = 1
for para in raw_paras:
para = para.strip()
if not para:
continue
para_lines = para.count("\n") + 1
paragraphs.append(
{
"content": para,
"tokens": self.count_tokens(para),
"start_line": line_num,
"end_line": line_num + para_lines - 1,
}
)
line_num += para_lines + 1 # +1 for blank line between paragraphs
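# Note: runs of more than one blank line still advance line_num by one,
# so later line numbers may drift slightly in that case.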
return paragraphs
def _chunk_by_paragraphs(
self,
paragraphs: list[dict[str, Any]],
source_path: str | None,
file_type: FileType | None,
metadata: dict[str, Any],
) -> list[Chunk]:
"""Chunk by combining paragraphs up to size limit."""
chunks: list[Chunk] = []
current_paras: list[str] = []
current_tokens = 0
chunk_start = paragraphs[0]["start_line"] if paragraphs else 1
chunk_end = chunk_start
for para in paragraphs:
para_content = para["content"]
para_tokens = para["tokens"]
# Handle paragraphs larger than chunk size
if para_tokens > self.chunk_size:
# Flush current content
if current_paras:
chunk_text = "\n\n".join(current_paras)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=chunk_start,
end_line=chunk_end,
file_type=file_type,
metadata=metadata,
)
)
current_paras = []
current_tokens = 0
# Split large paragraph
sub_chunks = self._split_large_text(
para_content,
source_path,
file_type,
metadata,
para["start_line"],
)
chunks.extend(sub_chunks)
chunk_start = para["end_line"] + 1
chunk_end = chunk_start
continue
# Check if adding paragraph exceeds limit
if current_tokens + para_tokens > self.chunk_size and current_paras:
chunk_text = "\n\n".join(current_paras)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=chunk_start,
end_line=chunk_end,
file_type=file_type,
metadata=metadata,
)
)
# Overlap: keep last paragraph if small enough
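# (the kept paragraph is repeated at the start of the next chunk;
# chunk_start below still points at the first new paragraph)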
overlap_para = None
if (
current_paras
and self.count_tokens(current_paras[-1]) <= self.chunk_overlap
):
overlap_para = current_paras[-1]
current_paras = [overlap_para] if overlap_para else []
current_tokens = self.count_tokens(overlap_para) if overlap_para else 0
chunk_start = para["start_line"]
current_paras.append(para_content)
current_tokens += para_tokens
chunk_end = para["end_line"]
# Final chunk
if current_paras:
chunk_text = "\n\n".join(current_paras)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=chunk_start,
end_line=chunk_end,
file_type=file_type,
metadata=metadata,
)
)
return chunks
def _chunk_by_sentences(
self,
content: str,
source_path: str | None,
file_type: FileType | None,
metadata: dict[str, Any],
) -> list[Chunk]:
"""Chunk by sentences."""
sentences = self._split_sentences(content)
if not sentences:
return []
chunks: list[Chunk] = []
current_sentences: list[str] = []
current_tokens = 0
for sentence in sentences:
sentence_tokens = self.count_tokens(sentence)
# Handle sentences larger than chunk size
if sentence_tokens > self.chunk_size:
if current_sentences:
chunk_text = " ".join(current_sentences)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=1,
end_line=1,
file_type=file_type,
metadata=metadata,
)
)
current_sentences = []
current_tokens = 0
# Truncate large sentence
truncated = self.truncate_to_tokens(sentence, self.chunk_size)
chunks.append(
self._create_chunk(
content=truncated,
source_path=source_path,
start_line=1,
end_line=1,
file_type=file_type,
metadata={**metadata, "truncated": True},
)
)
continue
# Check if adding sentence exceeds limit
if current_tokens + sentence_tokens > self.chunk_size and current_sentences:
chunk_text = " ".join(current_sentences)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=1,
end_line=1,
file_type=file_type,
metadata=metadata,
)
)
# Overlap: keep last sentence if small enough
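# (same strategy as the paragraph overlap above, applied per sentence)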
overlap = None
if (
current_sentences
and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap
):
overlap = current_sentences[-1]
current_sentences = [overlap] if overlap else []
current_tokens = self.count_tokens(overlap) if overlap else 0
current_sentences.append(sentence)
current_tokens += sentence_tokens
# Final chunk
if current_sentences:
chunk_text = " ".join(current_sentences)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=1,
end_line=content.count("\n") + 1,
file_type=file_type,
metadata=metadata,
)
)
return chunks
def _split_sentences(self, text: str) -> list[str]:
"""Split text into sentences."""
# Handle common sentence endings
# This is a simple approach - production might use nltk or spacy
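# e.g. _split_sentences("First sentence. Second one?\nThird line")
# -> ["First sentence.", "Second one?", "Third line"]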
sentence_pattern = re.compile(
r"(?<=[.!?])\s+(?=[A-Z])|" # Standard sentence ending
r"(?<=[.!?])\s*$|" # End of text
r"(?<=\n)\s*(?=\S)" # Newlines as boundaries
)
sentences = sentence_pattern.split(text)
return [s.strip() for s in sentences if s.strip()]
def _split_large_text(
self,
text: str,
source_path: str | None,
file_type: FileType | None,
metadata: dict[str, Any],
base_line: int,
) -> list[Chunk]:
"""Split text that exceeds chunk size."""
# First try sentences
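# (note: the sentence-based path does not receive base_line, so those
# chunks fall back to placeholder line numbers)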
sentences = self._split_sentences(text)
if len(sentences) > 1:
return self._chunk_by_sentences(text, source_path, file_type, metadata)
# Fall back to word-based splitting
return self._chunk_by_words(text, source_path, file_type, metadata, base_line)
def _chunk_by_words(
self,
text: str,
source_path: str | None,
file_type: FileType | None,
metadata: dict[str, Any],
base_line: int,
) -> list[Chunk]:
"""Last resort: chunk by words."""
words = text.split()
chunks: list[Chunk] = []
current_words: list[str] = []
current_tokens = 0
for word in words:
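# include a trailing space so the count approximates the word's share of the joined chunk text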
word_tokens = self.count_tokens(word + " ")
if current_tokens + word_tokens > self.chunk_size and current_words:
chunk_text = " ".join(current_words)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=base_line,
end_line=base_line,
file_type=file_type,
metadata=metadata,
)
)
# Word overlap
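# walk backwards from the end of the flushed chunk, keeping as many
# trailing words as fit within the chunk_overlap budget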
overlap_count = 0
overlap_words: list[str] = []
for w in reversed(current_words):
w_tokens = self.count_tokens(w + " ")
if overlap_count + w_tokens > self.chunk_overlap:
break
overlap_words.insert(0, w)
overlap_count += w_tokens
current_words = overlap_words
current_tokens = overlap_count
current_words.append(word)
current_tokens += word_tokens
# Final chunk
if current_words:
chunk_text = " ".join(current_words)
chunks.append(
self._create_chunk(
content=chunk_text,
source_path=source_path,
start_line=base_line,
end_line=base_line,
file_type=file_type,
metadata=metadata,
)
)
return chunks