feat(knowledge-base): implement Knowledge Base MCP Server (#57)

Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF); see the sketch after this list
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content,
  list_collections, get_collection_stats, update_document
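
As a rough illustration of the RRF step mentioned above (a sketch, not the code in
this commit; the function name and the k=60 constant are illustrative assumptions),
each chunk's fused score is the sum of 1 / (k + rank) over the semantic and keyword
result lists:

    # Illustrative sketch only -- not part of this commit.
    def rrf_fuse(semantic_hits: list[str], keyword_hits: list[str], k: int = 60) -> list[str]:
        """Merge two ranked chunk-ID lists; score(d) = sum of 1 / (k + rank)."""
        scores: dict[str, float] = {}
        for ranking in (semantic_hits, keyword_hits):
            for rank, chunk_id in enumerate(ranking, start=1):
                scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
        return sorted(scores, key=scores.__getitem__, reverse=True)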

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:33:26 +01:00
parent 18d717e996
commit d0fc7f37ff
26 changed files with 9530 additions and 120 deletions


@@ -1,162 +1,569 @@
"""
Syndarix Knowledge Base MCP Server.
Knowledge Base MCP Server.
Provides RAG capabilities with:
- pgvector for semantic search
- Per-project collection isolation
- Hybrid search (vector + keyword)
- Chunking strategies for code, markdown, and text
Per ADR-008: Knowledge Base RAG Architecture.
Provides RAG capabilities with pgvector for semantic search,
intelligent chunking, and collection management.
"""
import logging
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import Any

from fastapi import FastAPI
from fastmcp import FastMCP
from pydantic import Field

from collection_manager import CollectionManager, get_collection_manager
from config import get_settings
from database import DatabaseManager, get_database_manager
from embeddings import EmbeddingGenerator, get_embedding_generator
from exceptions import KnowledgeBaseError
from models import (
    ChunkType,
    DeleteRequest,
    FileType,
    IngestRequest,
    SearchRequest,
    SearchType,
)
from search import SearchEngine, get_search_engine

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Global instances
_database: DatabaseManager | None = None
_embeddings: EmbeddingGenerator | None = None
_search: SearchEngine | None = None
_collections: CollectionManager | None = None

@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan handler."""
    global _database, _embeddings, _search, _collections

    logger.info("Starting Knowledge Base MCP Server...")

    # Initialize database
    _database = get_database_manager()
    await _database.initialize()

    # Initialize embedding generator
    _embeddings = get_embedding_generator()
    await _embeddings.initialize()

    # Initialize search engine
    _search = get_search_engine()

    # Initialize collection manager
    _collections = get_collection_manager()

    logger.info("Knowledge Base MCP Server started successfully")

    yield

    # Cleanup
    logger.info("Shutting down Knowledge Base MCP Server...")
    if _embeddings:
        await _embeddings.close()
    if _database:
        await _database.close()
    logger.info("Knowledge Base MCP Server shut down")

# Create FastMCP server
mcp = FastMCP("syndarix-knowledge-base")

# Create FastAPI app with lifespan
app = FastAPI(
    title="Knowledge Base MCP Server",
    description="RAG with pgvector for semantic search",
    version="0.1.0",
    lifespan=lifespan,
)

@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint."""
status: dict[str, Any] = {
"status": "healthy",
"service": "knowledge-base",
"version": "0.1.0",
}
# Check database connection
try:
if _database and _database._pool:
async with _database.acquire() as conn:
await conn.fetchval("SELECT 1")
status["database"] = "connected"
else:
status["database"] = "not initialized"
except Exception as e:
status["database"] = f"error: {e}"
status["status"] = "degraded"
return status
# MCP Tools
@mcp.tool()
async def search_knowledge(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    query: str = Field(..., description="Search query"),
    search_type: str = Field(
        default="hybrid",
        description="Search type: semantic, keyword, or hybrid",
    ),
    collection: str | None = Field(
        default=None,
        description="Collection to search (None = all)",
    ),
    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of results",
    ),
    threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum similarity score",
    ),
    file_types: list[str] | None = Field(
        default=None,
        description="Filter by file types (python, javascript, etc.)",
    ),
) -> dict[str, Any]:
    """
    Search the knowledge base for relevant content.

    Supports semantic (vector), keyword (full-text), and hybrid search.
    Returns chunks ranked by relevance to the query.
    """
    try:
        # Parse search type
        try:
            search_type_enum = SearchType(search_type.lower())
        except ValueError:
            valid_types = [t.value for t in SearchType]
            return {
                "success": False,
                "error": f"Invalid search type: {search_type}. Valid types: {valid_types}",
            }

        # Parse file types
        file_type_enums = None
        if file_types:
            try:
                file_type_enums = [FileType(ft.lower()) for ft in file_types]
            except ValueError as e:
                return {
                    "success": False,
                    "error": f"Invalid file type: {e}",
                }

        request = SearchRequest(
            project_id=project_id,
            agent_id=agent_id,
            query=query,
            search_type=search_type_enum,
            collection=collection,
            limit=limit,
            threshold=threshold,
            file_types=file_type_enums,
        )

        response = await _search.search(request)  # type: ignore[union-attr]

        return {
            "success": True,
            "query": response.query,
            "search_type": response.search_type,
            "results": [
                {
                    "id": r.id,
                    "content": r.content,
                    "score": r.score,
                    "source_path": r.source_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "chunk_type": r.chunk_type,
                    "file_type": r.file_type,
                    "collection": r.collection,
                    "metadata": r.metadata,
                }
                for r in response.results
            ],
            "total_results": response.total_results,
            "search_time_ms": response.search_time_ms,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Search error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected search error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def ingest_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    content: str = Field(..., description="Content to ingest"),
    source_path: str | None = Field(
        default=None,
        description="Source file path for reference",
    ),
    collection: str = Field(
        default="default",
        description="Collection to store in",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking (python, javascript, etc.)",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata to store",
    ),
) -> dict[str, Any]:
    """
    Ingest content into the knowledge base.

    Content is automatically chunked based on type, embedded using
    the LLM Gateway, and stored in pgvector for search.
    """
    try:
        # Parse chunk type
        try:
            chunk_type_enum = ChunkType(chunk_type.lower())
        except ValueError:
            valid_types = [t.value for t in ChunkType]
            return {
                "success": False,
                "error": f"Invalid chunk type: {chunk_type}. Valid types: {valid_types}",
            }

        # Parse file type
        file_type_enum = None
        if file_type:
            try:
                file_type_enum = FileType(file_type.lower())
            except ValueError:
                valid_types = [t.value for t in FileType]
                return {
                    "success": False,
                    "error": f"Invalid file type: {file_type}. Valid types: {valid_types}",
                }

        request = IngestRequest(
            project_id=project_id,
            agent_id=agent_id,
            content=content,
            source_path=source_path,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata or {},
        )

        result = await _collections.ingest(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Ingest error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected ingest error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def delete_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str | None = Field(
        default=None,
        description="Delete by source file path",
    ),
    collection: str | None = Field(
        default=None,
        description="Delete entire collection",
    ),
    chunk_ids: list[str] | None = Field(
        default=None,
        description="Delete specific chunk IDs",
    ),
) -> dict[str, Any]:
    """
    Delete content from the knowledge base.

    Specify either source_path, collection, or chunk_ids to delete.
    """
    try:
        request = DeleteRequest(
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            collection=collection,
            chunk_ids=chunk_ids,
        )

        result = await _collections.delete(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_deleted": result.chunks_deleted,
            "error": result.error,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Delete error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected delete error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def list_collections(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
) -> dict[str, Any]:
    """
    List all collections in a project's knowledge base.

    Returns collection names with chunk counts and file types.
    """
    try:
        result = await _collections.list_collections(project_id)  # type: ignore[union-attr]

        return {
            "success": True,
            "project_id": result.project_id,
            "collections": [
                {
                    "name": c.name,
                    "chunk_count": c.chunk_count,
                    "total_tokens": c.total_tokens,
                    "file_types": c.file_types,
                    "created_at": c.created_at.isoformat(),
                    "updated_at": c.updated_at.isoformat(),
                }
                for c in result.collections
            ],
            "total_collections": result.total_collections,
        }
    except KnowledgeBaseError as e:
        logger.error(f"List collections error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected list collections error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def get_collection_stats(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
    collection: str = Field(..., description="Collection name"),
) -> dict[str, Any]:
    """
    Get detailed statistics for a collection.

    Returns chunk counts, token totals, and type breakdowns.
    """
    try:
        stats = await _collections.get_collection_stats(project_id, collection)  # type: ignore[union-attr]

        return {
            "success": True,
            "collection": stats.collection,
            "project_id": stats.project_id,
            "chunk_count": stats.chunk_count,
            "unique_sources": stats.unique_sources,
            "total_tokens": stats.total_tokens,
            "avg_chunk_size": stats.avg_chunk_size,
            "chunk_types": stats.chunk_types,
            "file_types": stats.file_types,
            "oldest_chunk": stats.oldest_chunk.isoformat() if stats.oldest_chunk else None,
            "newest_chunk": stats.newest_chunk.isoformat() if stats.newest_chunk else None,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Get collection stats error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected get collection stats error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def update_document(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str = Field(..., description="Source file path"),
    content: str = Field(..., description="New content"),
    collection: str = Field(
        default="default",
        description="Collection name",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata",
    ),
) -> dict[str, Any]:
    """
    Update a document in the knowledge base.

    Replaces all existing chunks for the source path with new content.
    """
    try:
        # Parse chunk type
        try:
            chunk_type_enum = ChunkType(chunk_type.lower())
        except ValueError:
            valid_types = [t.value for t in ChunkType]
            return {
                "success": False,
                "error": f"Invalid chunk type: {chunk_type}. Valid types: {valid_types}",
            }

        # Parse file type
        file_type_enum = None
        if file_type:
            try:
                file_type_enum = FileType(file_type.lower())
            except ValueError:
                valid_types = [t.value for t in FileType]
                return {
                    "success": False,
                    "error": f"Invalid file type: {file_type}. Valid types: {valid_types}",
                }

        result = await _collections.update_document(  # type: ignore[union-attr]
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            content=content,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata,
        )

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Update document error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected update document error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

def main() -> None:
    """Run the server."""
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "server:app",
        host=settings.host,
        port=settings.port,
        reload=settings.debug,
        log_level="info",
    )


if __name__ == "__main__":
    main()
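
For reference, a rough sketch of how an MCP client might exercise search_knowledge
in-process (not part of this commit; it assumes FastMCP's in-memory Client and that
the database, embedding, and search globals have already been initialized):

    # Hypothetical usage sketch -- project/agent IDs are placeholders.
    import asyncio

    from fastmcp import Client

    async def demo() -> None:
        async with Client(mcp) as client:
            result = await client.call_tool(
                "search_knowledge",
                {
                    "project_id": "example-project",
                    "agent_id": "example-agent",
                    "query": "how are hybrid results ranked?",
                    "search_type": "hybrid",
                    "limit": 5,
                },
            )
            print(result)

    asyncio.run(demo())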