feat(knowledge-base): implement Knowledge Base MCP Server (#57)

Implements RAG capabilities with pgvector for semantic search: - Intelligent chunking strategies (code-aware, markdown-aware, text) - Semantic search with vector similarity (HNSW index) - Keyword search with PostgreSQL full-text search - Hybrid search using Reciprocal Rank Fusion (RRF) - Redis caching for embeddings - Collection management (ingest, search, delete, stats) - FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document Testing: - 128 comprehensive tests covering all components - 58% code coverage (database integration tests use mocks) - Passes ruff linting and mypy type checking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:33:26 +01:00
parent 18d717e996
commit d0fc7f37ff
26 changed files with 9530 additions and 120 deletions
--- a/mcp-servers/knowledge-base/config.py
+++ b/mcp-servers/knowledge-base/config.py
@@ -0,0 +1,138 @@
+"""
+Configuration for Knowledge Base MCP Server.
+
+Uses pydantic-settings for environment variable loading.
+"""
+
+import os
+
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    """Application settings loaded from environment."""
+
+    # Server settings
+    host: str = Field(default="0.0.0.0", description="Server host")
+    port: int = Field(default=8002, description="Server port")
+    debug: bool = Field(default=False, description="Debug mode")
+
+    # Database settings
+    database_url: str = Field(
+        default="postgresql://postgres:postgres@localhost:5432/syndarix",
+        description="PostgreSQL connection URL with pgvector extension",
+    )
+    database_pool_size: int = Field(default=10, description="Connection pool size")
+    database_pool_max_overflow: int = Field(
+        default=20, description="Max overflow connections"
+    )
+
+    # Redis settings
+    redis_url: str = Field(
+        default="redis://localhost:6379/0",
+        description="Redis connection URL",
+    )
+
+    # LLM Gateway settings (for embeddings)
+    llm_gateway_url: str = Field(
+        default="http://localhost:8001",
+        description="LLM Gateway MCP server URL",
+    )
+
+    # Embedding settings
+    embedding_model: str = Field(
+        default="text-embedding-3-large",
+        description="Default embedding model",
+    )
+    embedding_dimension: int = Field(
+        default=1536,
+        description="Embedding vector dimension",
+    )
+    embedding_batch_size: int = Field(
+        default=100,
+        description="Max texts per embedding batch",
+    )
+    embedding_cache_ttl: int = Field(
+        default=86400,
+        description="Embedding cache TTL in seconds (24 hours)",
+    )
+
+    # Chunking settings
+    code_chunk_size: int = Field(
+        default=500,
+        description="Target tokens per code chunk",
+    )
+    code_chunk_overlap: int = Field(
+        default=50,
+        description="Token overlap between code chunks",
+    )
+    markdown_chunk_size: int = Field(
+        default=800,
+        description="Target tokens per markdown chunk",
+    )
+    markdown_chunk_overlap: int = Field(
+        default=100,
+        description="Token overlap between markdown chunks",
+    )
+    text_chunk_size: int = Field(
+        default=400,
+        description="Target tokens per text chunk",
+    )
+    text_chunk_overlap: int = Field(
+        default=50,
+        description="Token overlap between text chunks",
+    )
+
+    # Search settings
+    search_default_limit: int = Field(
+        default=10,
+        description="Default number of search results",
+    )
+    search_max_limit: int = Field(
+        default=100,
+        description="Maximum number of search results",
+    )
+    semantic_threshold: float = Field(
+        default=0.7,
+        description="Minimum similarity score for semantic search",
+    )
+    hybrid_semantic_weight: float = Field(
+        default=0.7,
+        description="Weight for semantic results in hybrid search",
+    )
+    hybrid_keyword_weight: float = Field(
+        default=0.3,
+        description="Weight for keyword results in hybrid search",
+    )
+
+    # Storage settings
+    embedding_ttl_days: int = Field(
+        default=30,
+        description="TTL for embedding records in days (0 = no expiry)",
+    )
+
+    model_config = {"env_prefix": "KB_", "env_file": ".env", "extra": "ignore"}
+
+
+# Global settings instance (lazy initialization)
+_settings: Settings | None = None
+
+
+def get_settings() -> Settings:
+    """Get the global settings instance."""
+    global _settings
+    if _settings is None:
+        _settings = Settings()
+    return _settings
+
+
+def reset_settings() -> None:
+    """Reset the global settings (for testing)."""
+    global _settings
+    _settings = None
+
+
+def is_test_mode() -> bool:
+    """Check if running in test mode."""
+    return os.getenv("IS_TEST", "").lower() in ("true", "1", "yes")