feat(knowledge-base): implement Knowledge Base MCP Server (#57)

Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF); see the sketch after this list
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content,
  list_collections, get_collection_stats, update_document
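
As a rough illustration of the RRF step mentioned above (a sketch, not the code in
this commit; the function name and the k=60 constant are illustrative assumptions),
each chunk's fused score is the sum of 1 / (k + rank) over the semantic and keyword
result lists:

    # Illustrative sketch only -- not part of this commit.
    def rrf_fuse(semantic_hits: list[str], keyword_hits: list[str], k: int = 60) -> list[str]:
        """Merge two ranked chunk-ID lists; score(d) = sum of 1 / (k + rank)."""
        scores: dict[str, float] = {}
        for ranking in (semantic_hits, keyword_hits):
            for rank, chunk_id in enumerate(ranking, start=1):
                scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
        return sorted(scores, key=scores.__getitem__, reverse=True)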

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:33:26 +01:00
parent 18d717e996
commit d0fc7f37ff
26 changed files with 9530 additions and 120 deletions


@@ -1,162 +1,569 @@
"""
Syndarix Knowledge Base MCP Server.
Knowledge Base MCP Server.
Provides RAG capabilities with:
- pgvector for semantic search
- Per-project collection isolation
- Hybrid search (vector + keyword)
- Chunking strategies for code, markdown, and text
Per ADR-008: Knowledge Base RAG Architecture.
Provides RAG capabilities with pgvector for semantic search,
intelligent chunking, and collection management.
"""
import logging
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import Any

from fastapi import FastAPI
from fastmcp import FastMCP
from pydantic import Field

from collection_manager import CollectionManager, get_collection_manager
from config import get_settings
from database import DatabaseManager, get_database_manager
from embeddings import EmbeddingGenerator, get_embedding_generator
from exceptions import KnowledgeBaseError
from models import (
    ChunkType,
    DeleteRequest,
    FileType,
    IngestRequest,
    SearchRequest,
    SearchType,
)
from search import SearchEngine, get_search_engine

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Global instances
_database: DatabaseManager | None = None
_embeddings: EmbeddingGenerator | None = None
_search: SearchEngine | None = None
_collections: CollectionManager | None = None

@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan handler."""
    global _database, _embeddings, _search, _collections

    logger.info("Starting Knowledge Base MCP Server...")

    # Initialize database
    _database = get_database_manager()
    await _database.initialize()

    # Initialize embedding generator
    _embeddings = get_embedding_generator()
    await _embeddings.initialize()

    # Initialize search engine
    _search = get_search_engine()

    # Initialize collection manager
    _collections = get_collection_manager()

    logger.info("Knowledge Base MCP Server started successfully")

    yield

    # Cleanup
    logger.info("Shutting down Knowledge Base MCP Server...")
    if _embeddings:
        await _embeddings.close()
    if _database:
        await _database.close()
    logger.info("Knowledge Base MCP Server shut down")

# Create FastMCP server
mcp = FastMCP("syndarix-knowledge-base")

# Create FastAPI app with lifespan
app = FastAPI(
    title="Knowledge Base MCP Server",
    description="RAG with pgvector for semantic search",
    version="0.1.0",
    lifespan=lifespan,
)

@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint."""
status: dict[str, Any] = {
"status": "healthy",
"service": "knowledge-base",
"version": "0.1.0",
}
# Check database connection
try:
if _database and _database._pool:
async with _database.acquire() as conn:
await conn.fetchval("SELECT 1")
status["database"] = "connected"
else:
status["database"] = "not initialized"
except Exception as e:
status["database"] = f"error: {e}"
status["status"] = "degraded"
return status
# MCP Tools
@mcp.tool()
async def search_knowledge(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    query: str = Field(..., description="Search query"),
    search_type: str = Field(
        default="hybrid",
        description="Search type: semantic, keyword, or hybrid",
    ),
    collection: str | None = Field(
        default=None,
        description="Collection to search (None = all)",
    ),
    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of results",
    ),
    threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum similarity score",
    ),
    file_types: list[str] | None = Field(
        default=None,
        description="Filter by file types (python, javascript, etc.)",
    ),
) -> dict[str, Any]:
    """
    Search the knowledge base for relevant content.

    Supports semantic (vector), keyword (full-text), and hybrid search.
    Returns chunks ranked by relevance to the query.
    """
    try:
        # Parse search type
        try:
            search_type_enum = SearchType(search_type.lower())
        except ValueError:
            valid_types = [t.value for t in SearchType]
            return {
                "success": False,
                "error": f"Invalid search type: {search_type}. Valid types: {valid_types}",
            }

        # Parse file types
        file_type_enums = None
        if file_types:
            try:
                file_type_enums = [FileType(ft.lower()) for ft in file_types]
            except ValueError as e:
                return {
                    "success": False,
                    "error": f"Invalid file type: {e}",
                }

        request = SearchRequest(
            project_id=project_id,
            agent_id=agent_id,
            query=query,
            search_type=search_type_enum,
            collection=collection,
            limit=limit,
            threshold=threshold,
            file_types=file_type_enums,
        )

        response = await _search.search(request)  # type: ignore[union-attr]

        return {
            "success": True,
            "query": response.query,
            "search_type": response.search_type,
            "results": [
                {
                    "id": r.id,
                    "content": r.content,
                    "score": r.score,
                    "source_path": r.source_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "chunk_type": r.chunk_type,
                    "file_type": r.file_type,
                    "collection": r.collection,
                    "metadata": r.metadata,
                }
                for r in response.results
            ],
            "total_results": response.total_results,
            "search_time_ms": response.search_time_ms,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Search error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected search error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def ingest_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    content: str = Field(..., description="Content to ingest"),
    source_path: str | None = Field(
        default=None,
        description="Source file path for reference",
    ),
    collection: str = Field(
        default="default",
        description="Collection to store in",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking (python, javascript, etc.)",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata to store",
    ),
) -> dict[str, Any]:
    """
    Ingest content into the knowledge base.

    Content is automatically chunked based on type, embedded using
    the LLM Gateway, and stored in pgvector for search.
    """
    try:
        # Parse chunk type
        try:
            chunk_type_enum = ChunkType(chunk_type.lower())
        except ValueError:
            valid_types = [t.value for t in ChunkType]
            return {
                "success": False,
                "error": f"Invalid chunk type: {chunk_type}. Valid types: {valid_types}",
            }

        # Parse file type
        file_type_enum = None
        if file_type:
            try:
                file_type_enum = FileType(file_type.lower())
            except ValueError:
                valid_types = [t.value for t in FileType]
                return {
                    "success": False,
                    "error": f"Invalid file type: {file_type}. Valid types: {valid_types}",
                }

        request = IngestRequest(
            project_id=project_id,
            agent_id=agent_id,
            content=content,
            source_path=source_path,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata or {},
        )

        result = await _collections.ingest(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Ingest error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected ingest error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def delete_content(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str | None = Field(
        default=None,
        description="Delete by source file path",
    ),
    collection: str | None = Field(
        default=None,
        description="Delete entire collection",
    ),
    chunk_ids: list[str] | None = Field(
        default=None,
        description="Delete specific chunk IDs",
    ),
) -> dict[str, Any]:
    """
    Delete content from the knowledge base.

    Specify either source_path, collection, or chunk_ids to delete.
    """
    try:
        request = DeleteRequest(
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            collection=collection,
            chunk_ids=chunk_ids,
        )

        result = await _collections.delete(request)  # type: ignore[union-attr]

        return {
            "success": result.success,
            "chunks_deleted": result.chunks_deleted,
            "error": result.error,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Delete error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected delete error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def list_collections(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
) -> dict[str, Any]:
    """
    List all collections in a project's knowledge base.

    Returns collection names with chunk counts and file types.
    """
    try:
        result = await _collections.list_collections(project_id)  # type: ignore[union-attr]

        return {
            "success": True,
            "project_id": result.project_id,
            "collections": [
                {
                    "name": c.name,
                    "chunk_count": c.chunk_count,
                    "total_tokens": c.total_tokens,
                    "file_types": c.file_types,
                    "created_at": c.created_at.isoformat(),
                    "updated_at": c.updated_at.isoformat(),
                }
                for c in result.collections
            ],
            "total_collections": result.total_collections,
        }
    except KnowledgeBaseError as e:
        logger.error(f"List collections error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected list collections error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def get_collection_stats(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),  # noqa: ARG001
    collection: str = Field(..., description="Collection name"),
) -> dict[str, Any]:
    """
    Get detailed statistics for a collection.

    Returns chunk counts, token totals, and type breakdowns.
    """
    try:
        stats = await _collections.get_collection_stats(project_id, collection)  # type: ignore[union-attr]

        return {
            "success": True,
            "collection": stats.collection,
            "project_id": stats.project_id,
            "chunk_count": stats.chunk_count,
            "unique_sources": stats.unique_sources,
            "total_tokens": stats.total_tokens,
            "avg_chunk_size": stats.avg_chunk_size,
            "chunk_types": stats.chunk_types,
            "file_types": stats.file_types,
            "oldest_chunk": stats.oldest_chunk.isoformat() if stats.oldest_chunk else None,
            "newest_chunk": stats.newest_chunk.isoformat() if stats.newest_chunk else None,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Get collection stats error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected get collection stats error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

@mcp.tool()
async def update_document(
    project_id: str = Field(..., description="Project ID for scoping"),
    agent_id: str = Field(..., description="Agent ID making the request"),
    source_path: str = Field(..., description="Source file path"),
    content: str = Field(..., description="New content"),
    collection: str = Field(
        default="default",
        description="Collection name",
    ),
    chunk_type: str = Field(
        default="text",
        description="Content type: code, markdown, or text",
    ),
    file_type: str | None = Field(
        default=None,
        description="File type for code chunking",
    ),
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional metadata",
    ),
) -> dict[str, Any]:
    """
    Update a document in the knowledge base.

    Replaces all existing chunks for the source path with new content.
    """
    try:
        # Parse chunk type
        try:
            chunk_type_enum = ChunkType(chunk_type.lower())
        except ValueError:
            valid_types = [t.value for t in ChunkType]
            return {
                "success": False,
                "error": f"Invalid chunk type: {chunk_type}. Valid types: {valid_types}",
            }

        # Parse file type
        file_type_enum = None
        if file_type:
            try:
                file_type_enum = FileType(file_type.lower())
            except ValueError:
                valid_types = [t.value for t in FileType]
                return {
                    "success": False,
                    "error": f"Invalid file type: {file_type}. Valid types: {valid_types}",
                }

        result = await _collections.update_document(  # type: ignore[union-attr]
            project_id=project_id,
            agent_id=agent_id,
            source_path=source_path,
            content=content,
            collection=collection,
            chunk_type=chunk_type_enum,
            file_type=file_type_enum,
            metadata=metadata,
        )

        return {
            "success": result.success,
            "chunks_created": result.chunks_created,
            "embeddings_generated": result.embeddings_generated,
            "source_path": result.source_path,
            "collection": result.collection,
            "chunk_ids": result.chunk_ids,
            "error": result.error,
        }
    except KnowledgeBaseError as e:
        logger.error(f"Update document error: {e}")
        return {
            "success": False,
            "error": e.message,
            "code": e.code.value,
        }
    except Exception as e:
        logger.error(f"Unexpected update document error: {e}")
        return {
            "success": False,
            "error": str(e),
        }

def main() -> None:
    """Run the server."""
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "server:app",
        host=settings.host,
        port=settings.port,
        reload=settings.debug,
        log_level="info",
    )


if __name__ == "__main__":
    main()
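
For reference, a rough sketch of how an MCP client might exercise search_knowledge
in-process (not part of this commit; it assumes FastMCP's in-memory Client and that
the database, embedding, and search globals have already been initialized):

    # Hypothetical usage sketch -- project/agent IDs are placeholders.
    import asyncio

    from fastmcp import Client

    async def demo() -> None:
        async with Client(mcp) as client:
            result = await client.call_tool(
                "search_knowledge",
                {
                    "project_id": "example-project",
                    "agent_id": "example-agent",
                    "query": "how are hybrid results ranked?",
                    "search_type": "hybrid",
                    "limit": 5,
                },
            )
            print(result)

    asyncio.run(demo())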