feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search: - Intelligent chunking strategies (code-aware, markdown-aware, text) - Semantic search with vector similarity (HNSW index) - Keyword search with PostgreSQL full-text search - Hybrid search using Reciprocal Rank Fusion (RRF) - Redis caching for embeddings - Collection management (ingest, search, delete, stats) - FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document Testing: - 128 comprehensive tests covering all components - 58% code coverage (database integration tests use mocks) - Passes ruff linting and mypy type checking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
321
mcp-servers/knowledge-base/models.py
Normal file
321
mcp-servers/knowledge-base/models.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""
|
||||
Data models for Knowledge Base MCP Server.
|
||||
|
||||
Defines database models, Pydantic schemas, and data structures
|
||||
for RAG operations with pgvector.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class SearchType(str, Enum):
    """Search strategies supported by the knowledge base.

    Members inherit from ``str`` so their values serialize directly in
    JSON/Pydantic payloads (e.g. ``SearchRequest.search_type``).
    """

    SEMANTIC = "semantic"  # Vector similarity search
    KEYWORD = "keyword"  # Full-text search
    HYBRID = "hybrid"  # Combined semantic + keyword
|
||||
|
||||
|
||||
class ChunkType(str, Enum):
    """Broad categories of content chunks.

    Serialized via ``.value`` in chunk payloads (see ``Chunk.to_dict``).
    """

    CODE = "code"
    MARKDOWN = "markdown"
    TEXT = "text"
    DOCUMENTATION = "documentation"
|
||||
|
||||
|
||||
class FileType(str, Enum):
    """Supported file types for chunking.

    Values are lowercase language/format names; they appear verbatim in
    serialized payloads (see ``Chunk.to_dict``).
    """

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    MARKDOWN = "markdown"
    TEXT = "text"
    JSON = "json"
    YAML = "yaml"
    TOML = "toml"


# Maps a file extension (leading dot included, lowercase) to its FileType.
# Built from per-type extension groups so related extensions stay together;
# the resulting keys and insertion order match the flat literal it replaces.
FILE_EXTENSION_MAP: dict[str, FileType] = {
    extension: file_type
    for file_type, extensions in [
        (FileType.PYTHON, (".py",)),
        (FileType.JAVASCRIPT, (".js", ".jsx")),
        (FileType.TYPESCRIPT, (".ts", ".tsx")),
        (FileType.GO, (".go",)),
        (FileType.RUST, (".rs",)),
        (FileType.JAVA, (".java",)),
        (FileType.MARKDOWN, (".md", ".mdx")),
        (FileType.TEXT, (".txt",)),
        (FileType.JSON, (".json",)),
        (FileType.YAML, (".yaml", ".yml")),
        (FileType.TOML, (".toml",)),
    ]
    for extension in extensions
}
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """A chunk of content ready for embedding.

    Produced by the chunking layer prior to embedding generation; carries
    the text plus provenance (source path, line span) and free-form
    metadata.
    """

    content: str  # the chunk text itself
    chunk_type: ChunkType  # broad category (code/markdown/text/documentation)
    file_type: FileType | None = None  # language/format when known
    source_path: str | None = None  # originating file, for provenance
    start_line: int | None = None  # line span in the source file, when known
    end_line: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    token_count: int = 0  # defaults to 0 until a count is supplied

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict; enums are flattened to values."""
        resolved_file_type = self.file_type.value if self.file_type else None
        return {
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "file_type": resolved_file_type,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "metadata": self.metadata,
            "token_count": self.token_count,
        }
|
||||
|
||||
|
||||
@dataclass
class KnowledgeEmbedding:
    """A knowledge embedding stored in the database.

    One chunk of content together with its vector embedding, provenance,
    and bookkeeping timestamps.
    """

    id: str
    project_id: str
    collection: str
    content: str
    embedding: list[float]  # the vector itself; excluded from to_dict()
    chunk_type: ChunkType
    source_path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    file_type: FileType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    content_hash: str | None = None  # presumably used for dedup — confirm with ingest path
    # Timestamps are timezone-aware (UTC) at creation time.
    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    expires_at: datetime | None = None  # None means no expiry recorded

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict; the embedding vector is omitted for size."""
        # Flatten optional enum / datetime fields up front so the literal
        # below stays uniform. Key order matches the original serialization.
        resolved_file_type = self.file_type.value if self.file_type else None
        resolved_expiry = self.expires_at.isoformat() if self.expires_at else None
        return {
            "id": self.id,
            "project_id": self.project_id,
            "collection": self.collection,
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "file_type": resolved_file_type,
            "metadata": self.metadata,
            "content_hash": self.content_hash,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "expires_at": resolved_expiry,
        }
|
||||
|
||||
|
||||
# Pydantic Request/Response Models
|
||||
|
||||
|
||||
class IngestRequest(BaseModel):
    """Payload describing content to be chunked, embedded, and stored."""

    # Scoping/identity: every knowledge-base row is partitioned by project.
    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    # The raw content plus optional hints for the chunking layer.
    content: str = Field(..., description="Content to ingest")
    source_path: str | None = Field(
        description="Source file path for reference", default=None
    )
    collection: str = Field(
        description="Collection to store in", default="default"
    )
    chunk_type: ChunkType = Field(
        description="Type of content", default=ChunkType.TEXT
    )
    file_type: FileType | None = Field(
        description="File type for code chunking", default=None
    )
    metadata: dict[str, Any] = Field(
        description="Additional metadata", default_factory=dict
    )
|
||||
|
||||
|
||||
class IngestResult(BaseModel):
    """Outcome of a single ingest operation."""

    success: bool = Field(..., description="Whether ingest succeeded")
    chunks_created: int = Field(
        description="Number of chunks created", default=0
    )
    embeddings_generated: int = Field(
        description="Number of embeddings generated", default=0
    )
    source_path: str | None = Field(
        description="Source path ingested", default=None
    )
    collection: str = Field(
        description="Collection stored in", default="default"
    )
    chunk_ids: list[str] = Field(
        description="IDs of created chunks", default_factory=list
    )
    # Populated only on failure; success=True implies error is None.
    error: str | None = Field(
        description="Error message if failed", default=None
    )
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
    """Parameters controlling a knowledge-base search."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    query: str = Field(..., description="Search query")
    search_type: SearchType = Field(
        description="Type of search", default=SearchType.HYBRID
    )
    # None means the search spans every collection in the project.
    collection: str | None = Field(
        description="Collection to search (None = all)", default=None
    )
    limit: int = Field(description="Max results", default=10, ge=1, le=100)
    threshold: float = Field(
        description="Minimum similarity score", default=0.7, ge=0.0, le=1.0
    )
    file_types: list[FileType] | None = Field(
        description="Filter by file types", default=None
    )
    include_metadata: bool = Field(
        description="Include metadata in results", default=True
    )
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
    """One matching chunk returned by a search, with its relevance score."""

    id: str = Field(..., description="Chunk ID")
    content: str = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score (0-1)")
    source_path: str | None = Field(
        description="Source file path", default=None
    )
    start_line: int | None = Field(
        description="Start line in source", default=None
    )
    end_line: int | None = Field(
        description="End line in source", default=None
    )
    # Enum values are carried as plain strings on the wire.
    chunk_type: str = Field(..., description="Type of chunk")
    file_type: str | None = Field(description="File type", default=None)
    collection: str = Field(..., description="Collection name")
    metadata: dict[str, Any] = Field(
        description="Additional metadata", default_factory=dict
    )

    @classmethod
    def from_embedding(
        cls, embedding: KnowledgeEmbedding, score: float
    ) -> "SearchResult":
        """Build a SearchResult from a stored embedding row and its score.

        The raw vector is not carried over — only the human-readable
        fields and provenance are exposed to callers.
        """
        resolved_file_type = (
            embedding.file_type.value if embedding.file_type else None
        )
        return cls(
            id=embedding.id,
            content=embedding.content,
            score=score,
            source_path=embedding.source_path,
            start_line=embedding.start_line,
            end_line=embedding.end_line,
            chunk_type=embedding.chunk_type.value,
            file_type=resolved_file_type,
            collection=embedding.collection,
            metadata=embedding.metadata,
        )
|
||||
|
||||
|
||||
class SearchResponse(BaseModel):
    """Envelope returned by a search: the results plus query echo and timing."""

    query: str = Field(..., description="Original query")
    search_type: str = Field(..., description="Type of search performed")
    results: list[SearchResult] = Field(
        description="Search results", default_factory=list
    )
    total_results: int = Field(description="Total results found", default=0)
    search_time_ms: float = Field(
        description="Search time in ms", default=0.0
    )
|
||||
|
||||
|
||||
class DeleteRequest(BaseModel):
    """Request to remove content from the knowledge base.

    A target may be given as a source path, a whole collection, or a list
    of chunk IDs. NOTE(review): the model itself does not enforce that
    exactly one selector is set — presumably the server validates this.
    """

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    source_path: str | None = Field(
        description="Delete by source path", default=None
    )
    collection: str | None = Field(
        description="Delete entire collection", default=None
    )
    chunk_ids: list[str] | None = Field(
        description="Delete specific chunks", default=None
    )
|
||||
|
||||
|
||||
class DeleteResult(BaseModel):
    """Outcome of a delete operation."""

    success: bool = Field(..., description="Whether delete succeeded")
    chunks_deleted: int = Field(
        description="Number of chunks deleted", default=0
    )
    # Populated only on failure.
    error: str | None = Field(
        description="Error message if failed", default=None
    )
|
||||
|
||||
|
||||
class CollectionInfo(BaseModel):
    """Summary of one collection within a project."""

    name: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(description="Number of chunks", default=0)
    total_tokens: int = Field(description="Total tokens stored", default=0)
    file_types: list[str] = Field(
        description="File types in collection", default_factory=list
    )
    # Required timestamps — a collection always has a known lifecycle.
    created_at: datetime = Field(..., description="Creation time")
    updated_at: datetime = Field(..., description="Last update time")
|
||||
|
||||
|
||||
class ListCollectionsResponse(BaseModel):
    """All collections visible for a given project."""

    project_id: str = Field(..., description="Project ID")
    collections: list[CollectionInfo] = Field(
        description="Collections in project", default_factory=list
    )
    total_collections: int = Field(description="Total count", default=0)
|
||||
|
||||
|
||||
class CollectionStats(BaseModel):
    """Detailed statistics for a single collection."""

    collection: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(description="Number of chunks", default=0)
    unique_sources: int = Field(
        description="Unique source files", default=0
    )
    total_tokens: int = Field(description="Total tokens", default=0)
    avg_chunk_size: float = Field(
        description="Average chunk size", default=0.0
    )
    # Histograms keyed by the string value of the respective enum.
    chunk_types: dict[str, int] = Field(
        description="Count by chunk type", default_factory=dict
    )
    file_types: dict[str, int] = Field(
        description="Count by file type", default_factory=dict
    )
    # None when the collection is empty.
    oldest_chunk: datetime | None = Field(
        description="Oldest chunk timestamp", default=None
    )
    newest_chunk: datetime | None = Field(
        description="Newest chunk timestamp", default=None
    )
|
||||
Reference in New Issue
Block a user