forked from cardosofelipe/fast-next-template
feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search: - Intelligent chunking strategies (code-aware, markdown-aware, text) - Semantic search with vector similarity (HNSW index) - Keyword search with PostgreSQL full-text search - Hybrid search using Reciprocal Rank Fusion (RRF) - Redis caching for embeddings - Collection management (ingest, search, delete, stats) - FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document Testing: - 128 comprehensive tests covering all components - 58% code coverage (database integration tests use mocks) - Passes ruff linting and mypy type checking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
281
mcp-servers/knowledge-base/chunking/base.py
Normal file
281
mcp-servers/knowledge-base/chunking/base.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Base chunker implementation.
|
||||
|
||||
Provides abstract interface and common utilities for content chunking.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
import tiktoken
|
||||
|
||||
from config import Settings, get_settings
|
||||
from exceptions import ChunkingError
|
||||
from models import FILE_EXTENSION_MAP, Chunk, ChunkType, FileType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseChunker(ABC):
    """Abstract base class for content chunkers.

    Concrete subclasses implement one chunking strategy for a particular
    kind of content (code, markdown, plain text). This base provides the
    shared tokenizer plumbing they all rely on.
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize chunker.

        Args:
            chunk_size: Target tokens per chunk.
            chunk_overlap: Token overlap between consecutive chunks.
            settings: Application settings; the global settings are used
                when omitted.
        """
        self._settings = settings if settings is not None else get_settings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # cl100k_base is the encoding used by GPT-4 / text-embedding-3,
        # so token counts line up with the embedding model's view of text.
        self._tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens in *text*."""
        encoded = self._tokenizer.encode(text)
        return len(encoded)

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Return *text* cut down to at most *max_tokens* tokens.

        The original string is returned unchanged when it already fits.
        """
        token_ids = self._tokenizer.encode(text)
        if len(token_ids) > max_tokens:
            return self._tokenizer.decode(token_ids[:max_tokens])
        return text

    @abstractmethod
    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """Split content into chunks.

        Args:
            content: Content to chunk.
            source_path: Source file path for reference.
            file_type: File type for specialized handling.
            metadata: Additional metadata to include on each chunk.

        Returns:
            List of Chunk objects.
        """
        ...

    @property
    @abstractmethod
    def chunk_type(self) -> ChunkType:
        """Get the chunk type this chunker produces."""
        ...

    def _create_chunk(
        self,
        content: str,
        source_path: str | None = None,
        start_line: int | None = None,
        end_line: int | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Chunk:
        """Build a Chunk for *content*, filling in its token count."""
        return Chunk(
            content=content,
            chunk_type=self.chunk_type,
            file_type=file_type,
            source_path=source_path,
            start_line=start_line,
            end_line=end_line,
            metadata={} if metadata is None else metadata,
            token_count=self.count_tokens(content),
        )
|
||||
|
||||
|
||||
class ChunkerFactory:
    """Factory for creating appropriate chunkers.

    Selects the best chunker based on an explicit chunk type, a known file
    type, or a source-path extension. Chunker instances are created lazily
    (their imports are deferred to avoid circular imports) and cached for
    reuse across calls.
    """

    def __init__(self, settings: Settings | None = None) -> None:
        """Initialize factory.

        Args:
            settings: Application settings; the global settings are used
                when omitted.
        """
        self._settings = settings or get_settings()
        # Cache of chunker instances keyed by strategy name
        # ("code" / "markdown" / "text").
        self._chunkers: dict[str, BaseChunker] = {}

    def _get_code_chunker(self) -> "BaseChunker":
        """Get or create the cached code chunker."""
        # Deferred import: chunking.code depends on this module.
        from chunking.code import CodeChunker

        if "code" not in self._chunkers:
            self._chunkers["code"] = CodeChunker(
                chunk_size=self._settings.code_chunk_size,
                chunk_overlap=self._settings.code_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["code"]

    def _get_markdown_chunker(self) -> "BaseChunker":
        """Get or create the cached markdown chunker."""
        # Deferred import: chunking.markdown depends on this module.
        from chunking.markdown import MarkdownChunker

        if "markdown" not in self._chunkers:
            self._chunkers["markdown"] = MarkdownChunker(
                chunk_size=self._settings.markdown_chunk_size,
                chunk_overlap=self._settings.markdown_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["markdown"]

    def _get_text_chunker(self) -> "BaseChunker":
        """Get or create the cached text chunker."""
        # Deferred import: chunking.text depends on this module.
        from chunking.text import TextChunker

        if "text" not in self._chunkers:
            self._chunkers["text"] = TextChunker(
                chunk_size=self._settings.text_chunk_size,
                chunk_overlap=self._settings.text_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["text"]

    def get_chunker(
        self,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
    ) -> BaseChunker:
        """
        Get appropriate chunker for content type.

        An explicit ``chunk_type`` takes precedence over ``file_type``;
        when neither is given, the text chunker is used.

        Args:
            file_type: File type to chunk
            chunk_type: Explicit chunk type to use

        Returns:
            Appropriate chunker instance
        """
        # If explicit chunk type specified, use it
        if chunk_type:
            if chunk_type == ChunkType.CODE:
                return self._get_code_chunker()
            elif chunk_type == ChunkType.MARKDOWN:
                return self._get_markdown_chunker()
            else:
                return self._get_text_chunker()

        # Otherwise, infer from file type
        if file_type:
            if file_type == FileType.MARKDOWN:
                return self._get_markdown_chunker()
            elif file_type in (FileType.TEXT, FileType.JSON, FileType.YAML, FileType.TOML):
                return self._get_text_chunker()
            else:
                # Any remaining file type is treated as code
                return self._get_code_chunker()

        # Default to text chunker
        return self._get_text_chunker()

    def get_chunker_for_path(self, source_path: str) -> tuple[BaseChunker, FileType | None]:
        """
        Get chunker based on file path extension.

        Args:
            source_path: File path to chunk

        Returns:
            Tuple of (chunker, file_type); file_type is None when the
            extension is unknown (the default text chunker is returned).
        """
        # Extract the last extension, lowercased (e.g. "a.b.PY" -> ".py")
        ext = ""
        if "." in source_path:
            ext = "." + source_path.rsplit(".", 1)[-1].lower()

        file_type = FILE_EXTENSION_MAP.get(ext)
        chunker = self.get_chunker(file_type=file_type)

        return chunker, file_type

    def chunk_content(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk content using appropriate strategy.

        Args:
            content: Content to chunk
            source_path: Source file path
            file_type: File type
            chunk_type: Explicit chunk type; always takes precedence over
                a file type inferred from ``source_path``
            metadata: Additional metadata

        Returns:
            List of chunks

        Raises:
            ChunkingError: If the selected chunker fails on the content.
        """
        # An explicit chunk_type always wins (matching get_chunker's
        # precedence); only fall back to path-based inference when neither
        # a file type nor a chunk type was supplied.
        if source_path and not file_type and not chunk_type:
            chunker, file_type = self.get_chunker_for_path(source_path)
        else:
            chunker = self.get_chunker(file_type=file_type, chunk_type=chunk_type)

        try:
            chunks = chunker.chunk(
                content=content,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )

            logger.debug(
                f"Chunked content into {len(chunks)} chunks "
                f"(type={chunker.chunk_type.value})"
            )

            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {e}")
            # Chain explicitly so the original traceback is preserved.
            raise ChunkingError(
                message=f"Failed to chunk content: {e}",
                cause=e,
            ) from e
|
||||
|
||||
|
||||
# Process-wide singleton factory, created lazily by get_chunker_factory()
# and cleared by reset_chunker_factory().
_chunker_factory: ChunkerFactory | None = None
|
||||
|
||||
|
||||
def get_chunker_factory() -> ChunkerFactory:
    """Return the process-wide ChunkerFactory, creating it on first use."""
    global _chunker_factory
    factory = _chunker_factory
    if factory is None:
        factory = ChunkerFactory()
        _chunker_factory = factory
    return factory
|
||||
|
||||
|
||||
def reset_chunker_factory() -> None:
    """Discard the cached global factory so the next access builds a fresh one.

    Intended for test isolation.
    """
    global _chunker_factory
    _chunker_factory = None
|
||||
Reference in New Issue
Block a user