""" Knowledge Base MCP Server. Provides RAG capabilities with pgvector for semantic search, intelligent chunking, and collection management. """ import logging from contextlib import asynccontextmanager from typing import Any from fastapi import FastAPI from fastmcp import FastMCP from pydantic import Field from collection_manager import CollectionManager, get_collection_manager from collections.abc import AsyncIterator from config import get_settings from database import DatabaseManager, get_database_manager from embeddings import EmbeddingGenerator, get_embedding_generator from exceptions import KnowledgeBaseError from models import ( ChunkType, DeleteRequest, FileType, IngestRequest, SearchRequest, SearchType, ) from search import SearchEngine, get_search_engine # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) # Global instances _database: DatabaseManager | None = None _embeddings: EmbeddingGenerator | None = None _search: SearchEngine | None = None _collections: CollectionManager | None = None @asynccontextmanager async def lifespan(_app: FastAPI) -> AsyncIterator[None]: """Application lifespan handler.""" global _database, _embeddings, _search, _collections logger.info("Starting Knowledge Base MCP Server...") # Initialize database _database = get_database_manager() await _database.initialize() # Initialize embedding generator _embeddings = get_embedding_generator() await _embeddings.initialize() # Initialize search engine _search = get_search_engine() # Initialize collection manager _collections = get_collection_manager() logger.info("Knowledge Base MCP Server started successfully") yield # Cleanup logger.info("Shutting down Knowledge Base MCP Server...") if _embeddings: await _embeddings.close() if _database: await _database.close() logger.info("Knowledge Base MCP Server shut down") # Create FastMCP server mcp = FastMCP("syndarix-knowledge-base") 
# Create FastAPI app with lifespan
app = FastAPI(
    title="Knowledge Base MCP Server",
    description="RAG with pgvector for semantic search",
    version="0.1.0",
    lifespan=lifespan,
)


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint.

    Reports overall service health plus the database connection state.
    The status degrades (rather than erroring) when the database probe
    fails, so orchestrators can distinguish "up but unhealthy" from "down".
    """
    status: dict[str, Any] = {
        "status": "healthy",
        "service": "knowledge-base",
        "version": "0.1.0",
    }

    # Check database connection with a trivial round-trip query.
    # NOTE(review): peeks at the private ``_pool`` attribute — consider a
    # public ``is_initialized`` accessor on DatabaseManager.
    try:
        if _database and _database._pool:
            async with _database.acquire() as conn:
                await conn.fetchval("SELECT 1")
            status["database"] = "connected"
        else:
            status["database"] = "not initialized"
    except Exception as e:
        status["database"] = f"error: {e}"
        status["status"] = "degraded"

    return status


def _parse_enum(enum_cls: type, value: str, label: str) -> tuple[Any, dict[str, Any] | None]:
    """Parse *value* (case-insensitive) into a member of *enum_cls*.

    Returns ``(member, None)`` on success, or ``(None, payload)`` where
    *payload* is a ready-to-return tool error dict naming *label* and
    listing the valid values.  Centralizes the validation boilerplate that
    was previously copy-pasted across the tools.
    """
    try:
        return enum_cls(value.lower()), None
    except ValueError:
        valid_types = [t.value for t in enum_cls]
        return None, {
            "success": False,
            "error": f"Invalid {label}: {value}. Valid types: {valid_types}",
        }


# MCP Tools
@mcp.tool()
async def search_knowledge(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    query: str = Field(..., description="Search query"),
    search_type: str = Field(
        default="hybrid",
        description="Search type: semantic, keyword, or hybrid",
    ),
    collection: str | None = Field(
        default=None,
        description="Collection to search (None = all)",
    ),
    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of results",
    ),
    threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum similarity score",
    ),
    file_types: list[str] | None = Field(
        default=None,
        description="Filter by file types (python, javascript, etc.)",
    ),
) -> dict[str, Any]:
    """
    Search the knowledge base for relevant content.

    Supports semantic (vector), keyword (full-text), and hybrid search.
    Returns chunks ranked by relevance to the query.
    """
    try:
        # Parse search type
        search_type_enum, error = _parse_enum(SearchType, search_type, "search type")
        if error:
            return error

        # Parse file type filters; reject on the first invalid entry.
        file_type_enums = None
        if file_types:
            file_type_enums = []
            for ft in file_types:
                ft_enum, error = _parse_enum(FileType, ft, "file type")
                if error:
                    return error
                file_type_enums.append(ft_enum)

        request = SearchRequest(
            project_id=project_id,
            agent_id=agent_id,
            query=query,
            search_type=search_type_enum,
            collection=collection,
            limit=limit,
            threshold=threshold,
            file_types=file_type_enums,
        )

        response = await _search.search(request)  # type: ignore[union-attr]

        return {
            "success": True,
            "query": response.query,
            "search_type": response.search_type,
            "results": [
                {
                    "id": r.id,
                    "content": r.content,
                    "score": r.score,
                    "source_path": r.source_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "chunk_type": r.chunk_type,
                    "file_type": r.file_type,
                    "collection": r.collection,
                    "metadata": r.metadata,
                }
                for r in response.results
            ],
            "total_results": response.total_results,
            "search_time_ms": response.search_time_ms,
        }

    except KnowledgeBaseError as e:
        logger.error("Search error: %s", e)
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        # logger.exception preserves the traceback for unexpected failures.
        logger.exception("Unexpected search error")
        return {
            "success": False,
            "error": str(e),
        }


@mcp.tool()
async def ingest_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    content: str = Field(..., description="Content to ingest"),
    source_path: str | None = Field(
        default=None,
        description="Source file path for reference",
    ),
    collection: str = Field(
        default="default",
        description="Collection to store in",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking (python, javascript, etc.)",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata to store",
    ),
) -> dict[str, Any]:
    """
    Ingest content into the knowledge base.

    Content is automatically chunked based on type, embedded using
    the LLM Gateway, and stored in pgvector for search.
    """
    try:
        # Parse chunk type
        chunk_type_enum, error = _parse_enum(ChunkType, chunk_type, "chunk type")
        if error:
            return error

        # Parse optional file type
        file_type_enum = None
        if file_type:
            file_type_enum, error = _parse_enum(FileType, file_type, "file type")
            if error:
                return error

        request = IngestRequest(
            project_id=project_id,
            agent_id=agent_id,
            content=content,
            source_path=source_path,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata or {},
        )

        result = await _collections.ingest(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }

    except KnowledgeBaseError as e:
        logger.error("Ingest error: %s", e)
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.exception("Unexpected ingest error")
        return {
            "success": False,
            "error": str(e),
        }


@mcp.tool()
async def delete_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str | None = Field(
        default=None,
        description="Delete by source file path",
    ),
    collection: str | None = Field(
        default=None,
        description="Delete entire collection",
    ),
    chunk_ids: list[str] | None = Field(
        default=None,
        description="Delete specific chunk IDs",
    ),
) -> dict[str, Any]:
    """
    Delete content from the knowledge base.

    Specify either source_path, collection, or chunk_ids to delete.
    """
    try:
        request = DeleteRequest(
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            collection=collection,
            chunk_ids=chunk_ids,
        )

        result = await _collections.delete(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_deleted": result.chunks_deleted,
            "error": result.error,
        }

    except KnowledgeBaseError as e:
        logger.error("Delete error: %s", e)
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.exception("Unexpected delete error")
        return {
            "success": False,
            "error": str(e),
        }


@mcp.tool()
async def list_collections(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
) -> dict[str, Any]:
    """
    List all collections in a project's knowledge base.

    Returns collection names with chunk counts and file types.
    """
    try:
        result = await _collections.list_collections(project_id)  # type: ignore[union-attr]

        return {
            "success": True,
            "project_id": result.project_id,
            "collections": [
                {
                    "name": c.name,
                    "chunk_count": c.chunk_count,
                    "total_tokens": c.total_tokens,
                    "file_types": c.file_types,
                    "created_at": c.created_at.isoformat(),
                    "updated_at": c.updated_at.isoformat(),
                }
                for c in result.collections
            ],
            "total_collections": result.total_collections,
        }

    except KnowledgeBaseError as e:
        logger.error("List collections error: %s", e)
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.exception("Unexpected list collections error")
        return {
            "success": False,
            "error": str(e),
        }


@mcp.tool()
async def get_collection_stats(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
    collection: str = Field(..., description="Collection name"),
) -> dict[str, Any]:
    """
    Get detailed statistics for a collection.

    Returns chunk counts, token totals, and type breakdowns.
    """
    try:
        stats = await _collections.get_collection_stats(project_id, collection)  # type: ignore[union-attr]

        return {
            "success": True,
            "collection": stats.collection,
            "project_id": stats.project_id,
            "chunk_count": stats.chunk_count,
            "unique_sources": stats.unique_sources,
            "total_tokens": stats.total_tokens,
            "avg_chunk_size": stats.avg_chunk_size,
            "chunk_types": stats.chunk_types,
            "file_types": stats.file_types,
            # Timestamps may be absent for an empty collection.
            "oldest_chunk": stats.oldest_chunk.isoformat() if stats.oldest_chunk else None,
            "newest_chunk": stats.newest_chunk.isoformat() if stats.newest_chunk else None,
        }

    except KnowledgeBaseError as e:
        logger.error("Get collection stats error: %s", e)
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.exception("Unexpected get collection stats error")
        return {
            "success": False,
            "error": str(e),
        }


@mcp.tool()
async def update_document(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str = Field(..., description="Source file path"),
    content: str = Field(..., description="New content"),
    collection: str = Field(
        default="default",
        description="Collection name",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata",
    ),
) -> dict[str, Any]:
    """
    Update a document in the knowledge base.

    Replaces all existing chunks for the source path with new content.
    """
    try:
        # Parse chunk type
        chunk_type_enum, error = _parse_enum(ChunkType, chunk_type, "chunk type")
        if error:
            return error

        # Parse optional file type
        file_type_enum = None
        if file_type:
            file_type_enum, error = _parse_enum(FileType, file_type, "file type")
            if error:
                return error

        result = await _collections.update_document(  # type: ignore[union-attr]
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            content=content,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata,
        )

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }

    except KnowledgeBaseError as e:
        logger.error("Update document error: %s", e)
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.exception("Unexpected update document error")
        return {
            "success": False,
            "error": str(e),
        }


def main() -> None:
    """Run the server under uvicorn using host/port/debug from settings."""
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "server:app",
        host=settings.host,
        port=settings.port,
        reload=settings.debug,
        log_level="info",
    )


if __name__ == "__main__":
    main()