"""
|
|
Plain text chunking implementation.
|
|
|
|
Provides simple text chunking with paragraph and sentence
|
|
boundary detection.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any
|
|
|
|
from chunking.base import BaseChunker
|
|
from config import Settings
|
|
from models import Chunk, ChunkType, FileType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TextChunker(BaseChunker):
|
|
"""
|
|
Plain text chunker with paragraph awareness.
|
|
|
|
Features:
|
|
- Splits on paragraph boundaries
|
|
- Falls back to sentence/word boundaries
|
|
- Configurable overlap for context preservation
|
|
"""

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize text chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.TEXT

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk plain text content.

        Tries paragraph boundaries first, then sentences.
        """
        if not content.strip():
            return []

        metadata = metadata or {}

        # Check if content fits in a single chunk
        total_tokens = self.count_tokens(content)
        if total_tokens <= self.chunk_size:
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            ]

        # Try paragraph-based chunking
        paragraphs = self._split_paragraphs(content)
        if len(paragraphs) > 1:
            return self._chunk_by_paragraphs(
                paragraphs, source_path, file_type, metadata
            )

        # Fall back to sentence-based chunking
        return self._chunk_by_sentences(content, source_path, file_type, metadata)
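
    # Dispatch summary for chunk() above: content that already fits in chunk_size
    # is returned as a single chunk; multi-paragraph content goes through
    # _chunk_by_paragraphs; a single oversized paragraph falls back to
    # _chunk_by_sentences.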

    def _split_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs."""
        paragraphs: list[dict[str, Any]] = []

        # Split on double newlines (paragraph boundaries)
        raw_paras = re.split(r"\n\s*\n", content)

        line_num = 1
        for para in raw_paras:
            para = para.strip()
            if not para:
                continue

            para_lines = para.count("\n") + 1
            paragraphs.append(
                {
                    "content": para,
                    "tokens": self.count_tokens(para),
                    "start_line": line_num,
                    "end_line": line_num + para_lines - 1,
                }
            )
            line_num += para_lines + 1  # +1 for blank line between paragraphs

        return paragraphs
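
    # Shape of the result above, shown on a small example (token counts depend on
    # BaseChunker.count_tokens, so the numbers are left as placeholders):
    #
    #     "First para.\n\nSecond\npara." ->
    #     [{"content": "First para.", "tokens": ..., "start_line": 1, "end_line": 1},
    #      {"content": "Second\npara.", "tokens": ..., "start_line": 3, "end_line": 4}]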

    def _chunk_by_paragraphs(
        self,
        paragraphs: list[dict[str, Any]],
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by combining paragraphs up to size limit."""
        chunks: list[Chunk] = []
        current_paras: list[str] = []
        current_tokens = 0
        chunk_start = paragraphs[0]["start_line"] if paragraphs else 1
        chunk_end = chunk_start

        for para in paragraphs:
            para_content = para["content"]
            para_tokens = para["tokens"]

            # Handle paragraphs larger than chunk size
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_paras:
                    chunk_text = "\n\n".join(current_paras)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=chunk_start,
                            end_line=chunk_end,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_paras = []
                    current_tokens = 0

                # Split large paragraph
                sub_chunks = self._split_large_text(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    para["start_line"],
                )
                chunks.extend(sub_chunks)
                chunk_start = para["end_line"] + 1
                chunk_end = chunk_start
                continue

            # Check if adding paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_paras:
                chunk_text = "\n\n".join(current_paras)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=chunk_start,
                        end_line=chunk_end,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last paragraph if small enough
                overlap_para = None
                if (
                    current_paras
                    and self.count_tokens(current_paras[-1]) <= self.chunk_overlap
                ):
                    overlap_para = current_paras[-1]

                current_paras = [overlap_para] if overlap_para else []
                current_tokens = self.count_tokens(overlap_para) if overlap_para else 0
                chunk_start = para["start_line"]

            current_paras.append(para_content)
            current_tokens += para_tokens
            chunk_end = para["end_line"]

        # Final chunk
        if current_paras:
            chunk_text = "\n\n".join(current_paras)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=chunk_start,
                    end_line=chunk_end,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
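
    # Worked example of the overlap rule above (token counts are hypothetical):
    # with chunk_size=10 and chunk_overlap=4, paragraphs of 6, 5, and 3 tokens
    # first flush a [p1] chunk; p1 (6 tokens) exceeds the 4-token overlap budget,
    # so it is not carried over, and the second chunk becomes p2 + p3 (8 tokens).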

    def _chunk_by_sentences(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by sentences."""
        sentences = self._split_sentences(content)

        if not sentences:
            return []

        chunks: list[Chunk] = []
        current_sentences: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # Handle sentences larger than chunk size
            if sentence_tokens > self.chunk_size:
                if current_sentences:
                    chunk_text = " ".join(current_sentences)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=1,
                            end_line=1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_sentences = []
                    current_tokens = 0

                # Truncate large sentence
                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            # Check if adding sentence exceeds limit
            if current_tokens + sentence_tokens > self.chunk_size and current_sentences:
                chunk_text = " ".join(current_sentences)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last sentence if small enough
                overlap = None
                if (
                    current_sentences
                    and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap
                ):
                    overlap = current_sentences[-1]

                current_sentences = [overlap] if overlap else []
                current_tokens = self.count_tokens(overlap) if overlap else 0

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_sentences:
            chunk_text = " ".join(current_sentences)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Handle common sentence endings
        # This is a simple approach - production might use nltk or spacy
        sentence_pattern = re.compile(
            r"(?<=[.!?])\s+(?=[A-Z])|"  # Standard sentence ending
            r"(?<=[.!?])\s*$|"  # End of text
            r"(?<=\n)\s*(?=\S)"  # Newlines as boundaries
        )

        sentences = sentence_pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]
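
    # Example split from the pattern above:
    #     "It works. Mostly fine!\nSee notes." ->
    #     ["It works.", "Mostly fine!", "See notes."]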

    def _split_large_text(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split text that exceeds chunk size."""
        # First try sentences
        sentences = self._split_sentences(text)

        if len(sentences) > 1:
            return self._chunk_by_sentences(text, source_path, file_type, metadata)

        # Fall back to word-based splitting
        return self._chunk_by_words(text, source_path, file_type, metadata, base_line)

    def _chunk_by_words(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Last resort: chunk by words."""
        words = text.split()
        chunks: list[Chunk] = []
        current_words: list[str] = []
        current_tokens = 0

        for word in words:
            word_tokens = self.count_tokens(word + " ")

            if current_tokens + word_tokens > self.chunk_size and current_words:
                chunk_text = " ".join(current_words)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Word overlap
                overlap_count = 0
                overlap_words: list[str] = []
                for w in reversed(current_words):
                    w_tokens = self.count_tokens(w + " ")
                    if overlap_count + w_tokens > self.chunk_overlap:
                        break
                    overlap_words.insert(0, w)
                    overlap_count += w_tokens

                current_words = overlap_words
                current_tokens = overlap_count

            current_words.append(word)
            current_tokens += word_tokens

        # Final chunk
        if current_words:
            chunk_text = " ".join(current_words)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
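

# Illustrative smoke test only -- not part of the module's public surface. The
# sizes here are arbitrary, and Settings defaults from config are assumed to
# apply when `settings` is omitted.
if __name__ == "__main__":
    sample = "First paragraph of text.\n\nSecond paragraph, a bit longer.\n\nThird one."
    chunker = TextChunker(chunk_size=32, chunk_overlap=8)
    for piece in chunker.chunk(sample, source_path="sample.txt"):
        print(piece)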