feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search:
- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
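For context on the hybrid search listed above, here is a minimal sketch of how Reciprocal Rank Fusion can combine a semantic result list with a keyword result list. The function name, the k constant (60 is a commonly used default), and the input shape (document ids ordered best-first) are illustrative and are not taken from this commit.

# Minimal RRF sketch (illustrative names and defaults, not from this commit)
def rrf_merge(semantic_ids: list[str], keyword_ids: list[str], k: int = 60) -> list[str]:
    scores: dict[str, float] = {}
    for ranking in (semantic_ids, keyword_ids):
        for rank, doc_id in enumerate(ranking, start=1):
            # Each list contributes 1 / (k + rank) for every document it returns
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # Highest fused score first
    return sorted(scores, key=scores.__getitem__, reverse=True)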
mcp-servers/knowledge-base/chunking/text.py (new file, 389 lines)
@@ -0,0 +1,389 @@
"""
Plain text chunking implementation.

Provides simple text chunking with paragraph and sentence
boundary detection.
"""

import logging
import re
from typing import Any

from chunking.base import BaseChunker
from config import Settings
from models import Chunk, ChunkType, FileType

logger = logging.getLogger(__name__)


class TextChunker(BaseChunker):
    """
    Plain text chunker with paragraph awareness.

    Features:
    - Splits on paragraph boundaries
    - Falls back to sentence/word boundaries
    - Configurable overlap for context preservation
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize text chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.TEXT

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk plain text content.

        Tries paragraph boundaries first, then sentences.
        """
        if not content.strip():
            return []

        metadata = metadata or {}

        # Check if content fits in a single chunk
        total_tokens = self.count_tokens(content)
        if total_tokens <= self.chunk_size:
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            ]

        # Try paragraph-based chunking
        paragraphs = self._split_paragraphs(content)
        if len(paragraphs) > 1:
            return self._chunk_by_paragraphs(
                paragraphs, source_path, file_type, metadata
            )

        # Fall back to sentence-based chunking
        return self._chunk_by_sentences(
            content, source_path, file_type, metadata
        )
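
    # Dispatch sketch (token counts are illustrative): with chunk_size=512, a
    # 300-token note comes back as a single chunk, a multi-paragraph document
    # is packed paragraph-by-paragraph, and one huge unbroken paragraph falls
    # through to sentence-based chunking.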

    def _split_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs."""
        paragraphs: list[dict[str, Any]] = []

        # Split on double newlines (paragraph boundaries)
        raw_paras = re.split(r"\n\s*\n", content)

        line_num = 1
        for para in raw_paras:
            para = para.strip()
            if not para:
                continue

            para_lines = para.count("\n") + 1
            paragraphs.append({
                "content": para,
                "tokens": self.count_tokens(para),
                "start_line": line_num,
                "end_line": line_num + para_lines - 1,
            })
            line_num += para_lines + 1  # +1 for blank line between paragraphs

        return paragraphs
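
    # Worked example (token counts depend on the tokenizer and are omitted):
    # "Intro line.\n\nBody one.\nBody two." splits into
    #     {"content": "Intro line.", "start_line": 1, "end_line": 1, ...}
    #     {"content": "Body one.\nBody two.", "start_line": 3, "end_line": 4, ...}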

    def _chunk_by_paragraphs(
        self,
        paragraphs: list[dict[str, Any]],
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by combining paragraphs up to size limit."""
        chunks: list[Chunk] = []
        current_paras: list[str] = []
        current_tokens = 0
        chunk_start = paragraphs[0]["start_line"] if paragraphs else 1
        chunk_end = chunk_start

        for para in paragraphs:
            para_content = para["content"]
            para_tokens = para["tokens"]

            # Handle paragraphs larger than chunk size
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_paras:
                    chunk_text = "\n\n".join(current_paras)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=chunk_start,
                            end_line=chunk_end,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_paras = []
                    current_tokens = 0

                # Split large paragraph
                sub_chunks = self._split_large_text(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    para["start_line"],
                )
                chunks.extend(sub_chunks)
                chunk_start = para["end_line"] + 1
                chunk_end = chunk_start
                continue

            # Check if adding paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_paras:
                chunk_text = "\n\n".join(current_paras)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=chunk_start,
                        end_line=chunk_end,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last paragraph if small enough
                overlap_para = None
                if current_paras and self.count_tokens(current_paras[-1]) <= self.chunk_overlap:
                    overlap_para = current_paras[-1]

                current_paras = [overlap_para] if overlap_para else []
                current_tokens = self.count_tokens(overlap_para) if overlap_para else 0
                chunk_start = para["start_line"]

            current_paras.append(para_content)
            current_tokens += para_tokens
            chunk_end = para["end_line"]

        # Final chunk
        if current_paras:
            chunk_text = "\n\n".join(current_paras)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=chunk_start,
                    end_line=chunk_end,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
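
    # Overlap sketch (sizes are illustrative): with chunk_size=100 and
    # chunk_overlap=20, paragraphs of 60, 60 and 30 tokens yield two chunks;
    # the 60-token paragraph exceeds the overlap budget, so nothing is carried
    # into the second chunk.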

    def _chunk_by_sentences(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by sentences."""
        sentences = self._split_sentences(content)

        if not sentences:
            return []

        chunks: list[Chunk] = []
        current_sentences: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # Handle sentences larger than chunk size
            if sentence_tokens > self.chunk_size:
                if current_sentences:
                    chunk_text = " ".join(current_sentences)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=1,
                            end_line=1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_sentences = []
                    current_tokens = 0

                # Truncate large sentence
                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            # Check if adding sentence exceeds limit
            if current_tokens + sentence_tokens > self.chunk_size and current_sentences:
                chunk_text = " ".join(current_sentences)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last sentence if small enough
                overlap = None
                if current_sentences and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap:
                    overlap = current_sentences[-1]

                current_sentences = [overlap] if overlap else []
                current_tokens = self.count_tokens(overlap) if overlap else 0

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_sentences:
            chunk_text = " ".join(current_sentences)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Handle common sentence endings
        # This is a simple approach - production might use nltk or spacy
        sentence_pattern = re.compile(
            r"(?<=[.!?])\s+(?=[A-Z])|"  # Standard sentence ending
            r"(?<=[.!?])\s*$|"  # End of text
            r"(?<=\n)\s*(?=\S)"  # Newlines as boundaries
        )

        sentences = sentence_pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]
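
    # Example split: "Hello world. How are you?\nFine."
    # -> ["Hello world.", "How are you?", "Fine."]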

    def _split_large_text(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split text that exceeds chunk size."""
        # First try sentences
        sentences = self._split_sentences(text)

        if len(sentences) > 1:
            return self._chunk_by_sentences(
                text, source_path, file_type, metadata
            )

        # Fall back to word-based splitting
        return self._chunk_by_words(
            text, source_path, file_type, metadata, base_line
        )

    def _chunk_by_words(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Last resort: chunk by words."""
        words = text.split()
        chunks: list[Chunk] = []
        current_words: list[str] = []
        current_tokens = 0

        for word in words:
            word_tokens = self.count_tokens(word + " ")

            if current_tokens + word_tokens > self.chunk_size and current_words:
                chunk_text = " ".join(current_words)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Word overlap
                overlap_count = 0
                overlap_words: list[str] = []
                for w in reversed(current_words):
                    w_tokens = self.count_tokens(w + " ")
                    if overlap_count + w_tokens > self.chunk_overlap:
                        break
                    overlap_words.insert(0, w)
                    overlap_count += w_tokens

                current_words = overlap_words
                current_tokens = overlap_count

            current_words.append(word)
            current_tokens += word_tokens

        # Final chunk
        if current_words:
            chunk_text = " ".join(current_words)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks