syndarix/mcp-servers/knowledge-base/server.py
Felipe Cardoso d0fc7f37ff feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF; see the sketch below)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content,
  list_collections, get_collection_stats, update_document
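
A minimal sketch of the RRF step (the standard formula with the
conventional k = 60; the function name and signature here are
illustrative, not the server's actual search code):

    def rrf_fuse(
        semantic_ids: list[str],
        keyword_ids: list[str],
        k: int = 60,
    ) -> list[str]:
        """Fuse two ranked ID lists; each doc scores sum(1/(k + rank))."""
        scores: dict[str, float] = {}
        for ranking in (semantic_ids, keyword_ids):
            for rank, doc_id in enumerate(ranking, start=1):
                scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
        # Highest fused score first
        return sorted(scores, key=lambda d: scores[d], reverse=True)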

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:33:26 +01:00

570 lines
17 KiB
Python

"""
Knowledge Base MCP Server.
Provides RAG capabilities with pgvector for semantic search,
intelligent chunking, and collection management.
"""
import logging
from contextlib import asynccontextmanager
from typing import Any
from fastapi import FastAPI
from fastmcp import FastMCP
from pydantic import Field
from collection_manager import CollectionManager, get_collection_manager
from collections.abc import AsyncIterator
from config import get_settings
from database import DatabaseManager, get_database_manager
from embeddings import EmbeddingGenerator, get_embedding_generator
from exceptions import KnowledgeBaseError
from models import (
ChunkType,
DeleteRequest,
FileType,
IngestRequest,
SearchRequest,
SearchType,
)
from search import SearchEngine, get_search_engine

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Global instances
_database: DatabaseManager | None = None
_embeddings: EmbeddingGenerator | None = None
_search: SearchEngine | None = None
_collections: CollectionManager | None = None

@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan handler."""
    global _database, _embeddings, _search, _collections

    logger.info("Starting Knowledge Base MCP Server...")

    # Initialize database
    _database = get_database_manager()
    await _database.initialize()

    # Initialize embedding generator
    _embeddings = get_embedding_generator()
    await _embeddings.initialize()

    # Initialize search engine
    _search = get_search_engine()

    # Initialize collection manager
    _collections = get_collection_manager()

    logger.info("Knowledge Base MCP Server started successfully")

    yield

    # Cleanup
    logger.info("Shutting down Knowledge Base MCP Server...")
    if _embeddings:
        await _embeddings.close()
    if _database:
        await _database.close()
    logger.info("Knowledge Base MCP Server shut down")

# Create FastMCP server
mcp = FastMCP("syndarix-knowledge-base")

# Create FastAPI app with lifespan
app = FastAPI(
    title="Knowledge Base MCP Server",
    description="RAG with pgvector for semantic search",
    version="0.1.0",
    lifespan=lifespan,
)
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint."""
status: dict[str, Any] = {
"status": "healthy",
"service": "knowledge-base",
"version": "0.1.0",
}
# Check database connection
try:
if _database and _database._pool:
async with _database.acquire() as conn:
await conn.fetchval("SELECT 1")
status["database"] = "connected"
else:
status["database"] = "not initialized"
except Exception as e:
status["database"] = f"error: {e}"
status["status"] = "degraded"
return status
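
# Illustrative /health response when the pool is reachable (shape follows
# the handler above; on a failed SELECT 1 probe, "database" carries the
# error text and "status" becomes "degraded" instead):
#   {"status": "healthy", "service": "knowledge-base",
#    "version": "0.1.0", "database": "connected"}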

# MCP Tools
@mcp.tool()
async def search_knowledge(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    query: str = Field(..., description="Search query"),
    search_type: str = Field(
        default="hybrid",
        description="Search type: semantic, keyword, or hybrid",
    ),
    collection: str | None = Field(
        default=None,
        description="Collection to search (None = all)",
    ),
    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of results",
    ),
    threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum similarity score",
    ),
    file_types: list[str] | None = Field(
        default=None,
        description="Filter by file types (python, javascript, etc.)",
    ),
) -> dict[str, Any]:
    """
    Search the knowledge base for relevant content.

    Supports semantic (vector), keyword (full-text), and hybrid search.
    Returns chunks ranked by relevance to the query.
    """
    try:
        # Parse search type
        try:
            search_type_enum = SearchType(search_type.lower())
        except ValueError:
            valid_types = [t.value for t in SearchType]
            return {
                "success": False,
                "error": f"Invalid search type: {search_type}. Valid types: {valid_types}",
            }

        # Parse file types
        file_type_enums = None
        if file_types:
            try:
                file_type_enums = [FileType(ft.lower()) for ft in file_types]
            except ValueError as e:
                return {
                    "success": False,
                    "error": f"Invalid file type: {e}",
                }

        request = SearchRequest(
            project_id=project_id,
            agent_id=agent_id,
            query=query,
            search_type=search_type_enum,
            collection=collection,
            limit=limit,
            threshold=threshold,
            file_types=file_type_enums,
        )

        response = await _search.search(request)  # type: ignore[union-attr]

        return {
            "success": True,
            "query": response.query,
            "search_type": response.search_type,
            "results": [
                {
                    "id": r.id,
                    "content": r.content,
                    "score": r.score,
                    "source_path": r.source_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "chunk_type": r.chunk_type,
                    "file_type": r.file_type,
                    "collection": r.collection,
                    "metadata": r.metadata,
                }
                for r in response.results
            ],
            "total_results": response.total_results,
            "search_time_ms": response.search_time_ms,
        }

    except KnowledgeBaseError as e:
        logger.error(f"Search error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected search error: {e}")
        return {
            "success": False,
            "error": str(e),
        }
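
# Illustrative success payload from search_knowledge (keys mirror the
# handler above; the values shown are hypothetical):
#   {"success": True, "query": "reciprocal rank fusion",
#    "search_type": "hybrid", "results": [...],
#    "total_results": 3, "search_time_ms": 42.0}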

@mcp.tool()
async def ingest_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    content: str = Field(..., description="Content to ingest"),
    source_path: str | None = Field(
        default=None,
        description="Source file path for reference",
    ),
    collection: str = Field(
        default="default",
        description="Collection to store in",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking (python, javascript, etc.)",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata to store",
    ),
) -> dict[str, Any]:
    """
    Ingest content into the knowledge base.

    Content is automatically chunked based on type, embedded using
    the LLM Gateway, and stored in pgvector for search.
    """
    try:
        # Parse chunk type
        try:
            chunk_type_enum = ChunkType(chunk_type.lower())
        except ValueError:
            valid_types = [t.value for t in ChunkType]
            return {
                "success": False,
                "error": f"Invalid chunk type: {chunk_type}. Valid types: {valid_types}",
            }

        # Parse file type
        file_type_enum = None
        if file_type:
            try:
                file_type_enum = FileType(file_type.lower())
            except ValueError:
                valid_types = [t.value for t in FileType]
                return {
                    "success": False,
                    "error": f"Invalid file type: {file_type}. Valid types: {valid_types}",
                }

        request = IngestRequest(
            project_id=project_id,
            agent_id=agent_id,
            content=content,
            source_path=source_path,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata or {},
        )

        result = await _collections.ingest(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }

    except KnowledgeBaseError as e:
        logger.error(f"Ingest error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected ingest error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def delete_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str | None = Field(
        default=None,
        description="Delete by source file path",
    ),
    collection: str | None = Field(
        default=None,
        description="Delete entire collection",
    ),
    chunk_ids: list[str] | None = Field(
        default=None,
        description="Delete specific chunk IDs",
    ),
) -> dict[str, Any]:
    """
    Delete content from the knowledge base.

    Specify either source_path, collection, or chunk_ids to delete.
    """
    try:
        request = DeleteRequest(
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            collection=collection,
            chunk_ids=chunk_ids,
        )

        result = await _collections.delete(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_deleted": result.chunks_deleted,
            "error": result.error,
        }

    except KnowledgeBaseError as e:
        logger.error(f"Delete error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected delete error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def list_collections(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
) -> dict[str, Any]:
    """
    List all collections in a project's knowledge base.

    Returns collection names with chunk counts and file types.
    """
    try:
        result = await _collections.list_collections(project_id)  # type: ignore[union-attr]

        return {
            "success": True,
            "project_id": result.project_id,
            "collections": [
                {
                    "name": c.name,
                    "chunk_count": c.chunk_count,
                    "total_tokens": c.total_tokens,
                    "file_types": c.file_types,
                    "created_at": c.created_at.isoformat(),
                    "updated_at": c.updated_at.isoformat(),
                }
                for c in result.collections
            ],
            "total_collections": result.total_collections,
        }

    except KnowledgeBaseError as e:
        logger.error(f"List collections error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected list collections error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def get_collection_stats(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
    collection: str = Field(..., description="Collection name"),
) -> dict[str, Any]:
    """
    Get detailed statistics for a collection.

    Returns chunk counts, token totals, and type breakdowns.
    """
    try:
        stats = await _collections.get_collection_stats(project_id, collection)  # type: ignore[union-attr]

        return {
            "success": True,
            "collection": stats.collection,
            "project_id": stats.project_id,
            "chunk_count": stats.chunk_count,
            "unique_sources": stats.unique_sources,
            "total_tokens": stats.total_tokens,
            "avg_chunk_size": stats.avg_chunk_size,
            "chunk_types": stats.chunk_types,
            "file_types": stats.file_types,
            "oldest_chunk": stats.oldest_chunk.isoformat() if stats.oldest_chunk else None,
            "newest_chunk": stats.newest_chunk.isoformat() if stats.newest_chunk else None,
        }

    except KnowledgeBaseError as e:
        logger.error(f"Get collection stats error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected get collection stats error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def update_document(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str = Field(..., description="Source file path"),
    content: str = Field(..., description="New content"),
    collection: str = Field(
        default="default",
        description="Collection name",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata",
    ),
) -> dict[str, Any]:
    """
    Update a document in the knowledge base.

    Replaces all existing chunks for the source path with new content.
    """
    try:
        # Parse chunk type
        try:
            chunk_type_enum = ChunkType(chunk_type.lower())
        except ValueError:
            valid_types = [t.value for t in ChunkType]
            return {
                "success": False,
                "error": f"Invalid chunk type: {chunk_type}. Valid types: {valid_types}",
            }

        # Parse file type
        file_type_enum = None
        if file_type:
            try:
                file_type_enum = FileType(file_type.lower())
            except ValueError:
                valid_types = [t.value for t in FileType]
                return {
                    "success": False,
                    "error": f"Invalid file type: {file_type}. Valid types: {valid_types}",
                }

        result = await _collections.update_document(  # type: ignore[union-attr]
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            content=content,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata,
        )

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }

    except KnowledgeBaseError as e:
        logger.error(f"Update document error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected update document error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

def main() -> None:
    """Run the server."""
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "server:app",
        host=settings.host,
        port=settings.port,
        reload=settings.debug,
        log_level="info",
    )


if __name__ == "__main__":
    main()
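
A hypothetical smoke test against the /health route defined above
(localhost:8000 is an assumption; the real host and port come from
config.get_settings()):

    import json
    from urllib.request import urlopen

    # Prints e.g. {'status': 'healthy', ..., 'database': 'connected'}
    with urlopen("http://localhost:8000/health") as resp:
        print(json.load(resp))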