feat(knowledge-base): implement Knowledge Base MCP Server (#57)

Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document

Testing:

- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:33:26 +01:00
parent 18d717e996
commit d0fc7f37ff
26 changed files with 9530 additions and 120 deletions
--- a/mcp-servers/knowledge-base/collection_manager.py
+++ b/mcp-servers/knowledge-base/collection_manager.py
@@ -0,0 +1,331 @@
+"""
+Collection management for Knowledge Base MCP Server.
+
+Provides operations for managing document collections including
+ingestion, deletion, and statistics.
+"""
+
+import logging
+from typing import Any
+
+from chunking.base import ChunkerFactory, get_chunker_factory
+from config import Settings, get_settings
+from database import DatabaseManager, get_database_manager
+from embeddings import EmbeddingGenerator, get_embedding_generator
+from models import (
+    ChunkType,
+    CollectionStats,
+    DeleteRequest,
+    DeleteResult,
+    FileType,
+    IngestRequest,
+    IngestResult,
+    ListCollectionsResponse,
+)
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+class CollectionManager:
+    """
+    Manages knowledge base collections.
+
+    Coordinates a chunker factory, an embedding generator and a database
+    manager to ingest, replace, delete and inspect stored chunks. Each
+    collaborator may be injected through the constructor; any that is left
+    as None is resolved from its module-level accessor on first use.
+    """
+
+    def __init__(
+        self,
+        settings: Settings | None = None,
+        database: DatabaseManager | None = None,
+        embeddings: EmbeddingGenerator | None = None,
+        chunker_factory: ChunkerFactory | None = None,
+    ) -> None:
+        """
+        Initialize collection manager.
+
+        Args:
+            settings: Settings to use; get_settings() is called when omitted
+            database: Database manager; resolved on first use when omitted
+            embeddings: Embedding generator; resolved on first use when omitted
+            chunker_factory: Chunker factory; resolved on first use when omitted
+        """
+        self._settings = settings or get_settings()
+        self._database = database
+        self._embeddings = embeddings
+        self._chunker_factory = chunker_factory
+
+    @property
+    def database(self) -> DatabaseManager:
+        """Get database manager, resolving the shared one on first access."""
+        if self._database is None:
+            self._database = get_database_manager()
+        return self._database
+
+    @property
+    def embeddings(self) -> EmbeddingGenerator:
+        """Get embedding generator, resolving the shared one on first access."""
+        if self._embeddings is None:
+            self._embeddings = get_embedding_generator()
+        return self._embeddings
+
+    @property
+    def chunker_factory(self) -> ChunkerFactory:
+        """Get chunker factory, resolving the shared one on first access."""
+        if self._chunker_factory is None:
+            self._chunker_factory = get_chunker_factory()
+        return self._chunker_factory
+
+    @staticmethod
+    def _ingest_failure(request: IngestRequest, error: Exception) -> IngestResult:
+        """Build the failed IngestResult reported for a request that raised."""
+        return IngestResult(
+            success=False,
+            chunks_created=0,
+            embeddings_generated=0,
+            source_path=request.source_path,
+            collection=request.collection,
+            chunk_ids=[],
+            error=str(error),
+        )
+
+    async def _prepare_chunks(
+        self,
+        request: IngestRequest,
+    ) -> tuple[list[Any], list[Any]]:
+        """
+        Chunk the request content and generate one embedding per chunk.
+
+        Performs no database writes, so callers can run it before touching
+        stored data.
+
+        Args:
+            request: Ingest request with content and options
+
+        Returns:
+            Tuple of (chunks, embeddings); both are empty lists when the
+            chunker produced nothing
+
+        Raises:
+            ValueError: If the number of embeddings differs from the
+                number of chunks
+        """
+        chunks = self.chunker_factory.chunk_content(
+            content=request.content,
+            source_path=request.source_path,
+            file_type=request.file_type,
+            chunk_type=request.chunk_type,
+            metadata=request.metadata,
+        )
+
+        # Nothing to embed: skip the embedding call entirely.
+        if not chunks:
+            return [], []
+
+        embeddings_list = await self.embeddings.generate_batch(
+            texts=[chunk.content for chunk in chunks],
+            project_id=request.project_id,
+            agent_id=request.agent_id,
+        )
+
+        # zip(strict=True) only raises once the shorter input is exhausted,
+        # i.e. after rows have already been written. Check the counts here,
+        # before any write happens.
+        if len(embeddings_list) != len(chunks):
+            raise ValueError(
+                f"Embedding count mismatch: expected {len(chunks)}, "
+                f"got {len(embeddings_list)}"
+            )
+
+        return chunks, embeddings_list
+
+    async def _store_chunks(
+        self,
+        request: IngestRequest,
+        chunks: list[Any],
+        embeddings_list: list[Any],
+    ) -> list[str]:
+        """
+        Persist every chunk/embedding pair and return the new chunk IDs.
+
+        If storing fails part-way, the chunks written so far are removed
+        again (best effort) before the original error is re-raised, so a
+        failed ingest does not leave unreported rows behind.
+        """
+        chunk_ids: list[str] = []
+        try:
+            for chunk, embedding in zip(chunks, embeddings_list, strict=True):
+                # Chunk-level metadata overrides request-level metadata.
+                chunk_metadata = {
+                    **request.metadata,
+                    **chunk.metadata,
+                    "token_count": chunk.token_count,
+                }
+
+                chunk_id = await self.database.store_embedding(
+                    project_id=request.project_id,
+                    collection=request.collection,
+                    content=chunk.content,
+                    embedding=embedding,
+                    chunk_type=chunk.chunk_type,
+                    source_path=chunk.source_path or request.source_path,
+                    start_line=chunk.start_line,
+                    end_line=chunk.end_line,
+                    file_type=chunk.file_type or request.file_type,
+                    metadata=chunk_metadata,
+                )
+                chunk_ids.append(chunk_id)
+        except Exception:
+            if chunk_ids:
+                try:
+                    await self.database.delete_by_ids(
+                        project_id=request.project_id,
+                        chunk_ids=chunk_ids,
+                    )
+                except Exception:
+                    # Rollback is best effort; the original error still wins.
+                    logger.exception(
+                        "Failed to roll back %d partially stored chunks",
+                        len(chunk_ids),
+                    )
+            raise
+
+        return chunk_ids
+
+    async def _store_prepared(
+        self,
+        request: IngestRequest,
+        chunks: list[Any],
+        embeddings_list: list[Any],
+    ) -> IngestResult:
+        """Store prepared chunks and build the successful IngestResult."""
+        if not chunks:
+            return IngestResult(
+                success=True,
+                chunks_created=0,
+                embeddings_generated=0,
+                source_path=request.source_path,
+                collection=request.collection,
+                chunk_ids=[],
+            )
+
+        chunk_ids = await self._store_chunks(request, chunks, embeddings_list)
+
+        logger.info(
+            "Ingested %d chunks into collection '%s' for project %s",
+            len(chunks),
+            request.collection,
+            request.project_id,
+        )
+
+        return IngestResult(
+            success=True,
+            chunks_created=len(chunks),
+            embeddings_generated=len(embeddings_list),
+            source_path=request.source_path,
+            collection=request.collection,
+            chunk_ids=chunk_ids,
+        )
+
+    async def ingest(self, request: IngestRequest) -> IngestResult:
+        """
+        Ingest content into the knowledge base.
+
+        Chunks the content, generates embeddings, and stores them. Any
+        Exception raised along the way is logged with its traceback and
+        reported through the result instead of being propagated.
+
+        Args:
+            request: Ingest request with content and options
+
+        Returns:
+            Ingest result with created chunk IDs, or a result with
+            success=False and error set when a step failed
+        """
+        try:
+            chunks, embeddings_list = await self._prepare_chunks(request)
+            return await self._store_prepared(request, chunks, embeddings_list)
+        except Exception as e:
+            logger.exception("Ingest error: %s", e)
+            return self._ingest_failure(request, e)
+
+    async def delete(self, request: DeleteRequest) -> DeleteResult:
+        """
+        Delete content from the knowledge base.
+
+        Supports deletion by chunk IDs, source path, or collection. Only
+        the first truthy target is used, checked in that order.
+
+        Args:
+            request: Delete request with target specification
+
+        Returns:
+            Delete result with count of deleted chunks, or a result with
+            success=False and error set when no target was given or the
+            database call failed
+        """
+        try:
+            # NOTE(review): an empty chunk_ids list is falsy and falls
+            # through to the source_path / collection branches - confirm
+            # that is intended for callers passing chunk_ids=[].
+            if request.chunk_ids:
+                # Delete specific chunks
+                deleted_count = await self.database.delete_by_ids(
+                    project_id=request.project_id,
+                    chunk_ids=request.chunk_ids,
+                )
+            elif request.source_path:
+                # Delete by source path
+                deleted_count = await self.database.delete_by_source(
+                    project_id=request.project_id,
+                    source_path=request.source_path,
+                    collection=request.collection,
+                )
+            elif request.collection:
+                # Delete entire collection
+                deleted_count = await self.database.delete_collection(
+                    project_id=request.project_id,
+                    collection=request.collection,
+                )
+            else:
+                return DeleteResult(
+                    success=False,
+                    chunks_deleted=0,
+                    error="Must specify chunk_ids, source_path, or collection",
+                )
+
+            logger.info(
+                "Deleted %s chunks for project %s",
+                deleted_count,
+                request.project_id,
+            )
+
+            return DeleteResult(
+                success=True,
+                chunks_deleted=deleted_count,
+            )
+
+        except Exception as e:
+            logger.exception("Delete error: %s", e)
+            return DeleteResult(
+                success=False,
+                chunks_deleted=0,
+                error=str(e),
+            )
+
+    async def list_collections(self, project_id: str) -> ListCollectionsResponse:
+        """
+        List all collections for a project.
+
+        Database errors are not caught here and propagate to the caller.
+
+        Args:
+            project_id: Project ID
+
+        Returns:
+            Response holding the collections reported by the database
+            and their count
+        """
+        collections = await self.database.list_collections(project_id)
+
+        return ListCollectionsResponse(
+            project_id=project_id,
+            collections=collections,
+            total_collections=len(collections),
+        )
+
+    async def get_collection_stats(
+        self,
+        project_id: str,
+        collection: str,
+    ) -> CollectionStats:
+        """
+        Get statistics for a collection.
+
+        Delegates directly to the database manager.
+
+        Args:
+            project_id: Project ID
+            collection: Collection name
+
+        Returns:
+            Collection statistics
+        """
+        return await self.database.get_collection_stats(project_id, collection)
+
+    async def update_document(
+        self,
+        project_id: str,
+        agent_id: str,
+        source_path: str,
+        content: str,
+        collection: str = "default",
+        chunk_type: ChunkType = ChunkType.TEXT,
+        file_type: FileType | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> IngestResult:
+        """
+        Update a document by replacing existing chunks.
+
+        The new content is chunked and embedded first; only when that
+        succeeds are the existing chunks for the source path deleted and
+        the new ones stored. A failure while preparing the new content
+        therefore leaves the previously stored document untouched.
+
+        Errors from building the request or from deleting the old chunks
+        propagate to the caller; chunking, embedding and storage errors
+        are reported through the returned result.
+
+        Args:
+            project_id: Project ID
+            agent_id: Agent ID
+            source_path: Source file path
+            content: New content
+            collection: Collection name
+            chunk_type: Type of content
+            file_type: File type for code chunking
+            metadata: Additional metadata
+
+        Returns:
+            Ingest result
+        """
+        # Build the request before deleting anything, so an invalid
+        # request cannot wipe the existing document.
+        request = IngestRequest(
+            project_id=project_id,
+            agent_id=agent_id,
+            content=content,
+            source_path=source_path,
+            collection=collection,
+            chunk_type=chunk_type,
+            file_type=file_type,
+            metadata=metadata or {},
+        )
+
+        # Chunk and embed up front: this step performs no writes.
+        try:
+            chunks, embeddings_list = await self._prepare_chunks(request)
+        except Exception as e:
+            logger.exception("Ingest error: %s", e)
+            return self._ingest_failure(request, e)
+
+        # The replacement is ready: drop the old chunks for this source.
+        await self.database.delete_by_source(
+            project_id=project_id,
+            source_path=source_path,
+            collection=collection,
+        )
+
+        # Then store the new content
+        try:
+            return await self._store_prepared(request, chunks, embeddings_list)
+        except Exception as e:
+            logger.exception("Ingest error: %s", e)
+            return self._ingest_failure(request, e)
+
+    async def cleanup_expired(self) -> int:
+        """
+        Remove expired embeddings from all collections.
+
+        Delegates directly to the database manager.
+
+        Returns:
+            Number of embeddings removed
+        """
+        return await self.database.cleanup_expired()
+
+
+# Process-wide CollectionManager instance. It stays None until
+# get_collection_manager() builds it on first call, and
+# reset_collection_manager() sets it back to None.
+_collection_manager: CollectionManager | None = None
+
+
+def get_collection_manager() -> CollectionManager:
+    """Return the shared collection manager, creating it on first call."""
+    global _collection_manager
+    manager = _collection_manager
+    if manager is None:
+        # First call (or first call after a reset): build and remember it.
+        manager = CollectionManager()
+        _collection_manager = manager
+    return manager
+
+
+def reset_collection_manager() -> None:
+    """Drop the shared collection manager so the next lookup rebuilds it.
+
+    Intended for tests that need a fresh instance.
+    """
+    global _collection_manager
+    # Only the module-level reference is cleared; nothing else is touched.
+    _collection_manager = None