feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search: - Intelligent chunking strategies (code-aware, markdown-aware, text) - Semantic search with vector similarity (HNSW index) - Keyword search with PostgreSQL full-text search - Hybrid search using Reciprocal Rank Fusion (RRF) - Redis caching for embeddings - Collection management (ingest, search, delete, stats) - FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document Testing: - 128 comprehensive tests covering all components - 58% code coverage (database integration tests use mocks) - Passes ruff linting and mypy type checking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
321
mcp-servers/knowledge-base/models.py
Normal file
321
mcp-servers/knowledge-base/models.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""
|
||||
Data models for Knowledge Base MCP Server.
|
||||
|
||||
Defines database models, Pydantic schemas, and data structures
|
||||
for RAG operations with pgvector.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class SearchType(str, Enum):
    """Search strategies supported by the knowledge base.

    Members inherit from ``str`` so their values serialize directly in
    JSON/Pydantic payloads (e.g. ``SearchRequest.search_type``).
    """

    SEMANTIC = "semantic"  # Vector similarity search
    KEYWORD = "keyword"  # Full-text search
    HYBRID = "hybrid"  # Combined semantic + keyword
|
||||
|
||||
|
||||
class ChunkType(str, Enum):
    """Broad categories of content chunks.

    Serialized via ``.value`` in chunk payloads (see ``Chunk.to_dict``).
    """

    CODE = "code"
    MARKDOWN = "markdown"
    TEXT = "text"
    DOCUMENTATION = "documentation"
|
||||
|
||||
|
||||
class FileType(str, Enum):
    """Supported file types for chunking.

    Values are lowercase language/format names; they appear verbatim in
    serialized payloads (see ``Chunk.to_dict``).
    """

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    MARKDOWN = "markdown"
    TEXT = "text"
    JSON = "json"
    YAML = "yaml"
    TOML = "toml"


# Maps a file extension (leading dot included, lowercase) to its FileType.
# Built from per-type extension groups so related extensions stay together;
# the resulting keys and insertion order match the flat literal it replaces.
FILE_EXTENSION_MAP: dict[str, FileType] = {
    extension: file_type
    for file_type, extensions in [
        (FileType.PYTHON, (".py",)),
        (FileType.JAVASCRIPT, (".js", ".jsx")),
        (FileType.TYPESCRIPT, (".ts", ".tsx")),
        (FileType.GO, (".go",)),
        (FileType.RUST, (".rs",)),
        (FileType.JAVA, (".java",)),
        (FileType.MARKDOWN, (".md", ".mdx")),
        (FileType.TEXT, (".txt",)),
        (FileType.JSON, (".json",)),
        (FileType.YAML, (".yaml", ".yml")),
        (FileType.TOML, (".toml",)),
    ]
    for extension in extensions
}
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """A chunk of content ready for embedding.

    Produced by the chunking layer prior to embedding generation; carries
    the text plus provenance (source path, line span) and free-form
    metadata.
    """

    content: str  # the chunk text itself
    chunk_type: ChunkType  # broad category (code/markdown/text/documentation)
    file_type: FileType | None = None  # language/format when known
    source_path: str | None = None  # originating file, for provenance
    start_line: int | None = None  # line span in the source file, when known
    end_line: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    token_count: int = 0  # defaults to 0 until a count is supplied

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict; enums are flattened to values."""
        resolved_file_type = self.file_type.value if self.file_type else None
        return {
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "file_type": resolved_file_type,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "metadata": self.metadata,
            "token_count": self.token_count,
        }
|
||||
|
||||
|
||||
@dataclass
class KnowledgeEmbedding:
    """A knowledge embedding stored in the database.

    One chunk of content together with its vector embedding, provenance,
    and bookkeeping timestamps.
    """

    id: str
    project_id: str
    collection: str
    content: str
    embedding: list[float]  # the vector itself; excluded from to_dict()
    chunk_type: ChunkType
    source_path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    file_type: FileType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    content_hash: str | None = None  # presumably used for dedup — confirm with ingest path
    # Timestamps are timezone-aware (UTC) at creation time.
    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    expires_at: datetime | None = None  # None means no expiry recorded

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict; the embedding vector is omitted for size."""
        # Flatten optional enum / datetime fields up front so the literal
        # below stays uniform. Key order matches the original serialization.
        resolved_file_type = self.file_type.value if self.file_type else None
        resolved_expiry = self.expires_at.isoformat() if self.expires_at else None
        return {
            "id": self.id,
            "project_id": self.project_id,
            "collection": self.collection,
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "file_type": resolved_file_type,
            "metadata": self.metadata,
            "content_hash": self.content_hash,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "expires_at": resolved_expiry,
        }
|
||||
|
||||
|
||||
# Pydantic Request/Response Models
|
||||
|
||||
|
||||
class IngestRequest(BaseModel):
    """Payload describing content to be chunked, embedded, and stored."""

    # Scoping/identity: every knowledge-base row is partitioned by project.
    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    # The raw content plus optional hints for the chunking layer.
    content: str = Field(..., description="Content to ingest")
    source_path: str | None = Field(
        description="Source file path for reference", default=None
    )
    collection: str = Field(
        description="Collection to store in", default="default"
    )
    chunk_type: ChunkType = Field(
        description="Type of content", default=ChunkType.TEXT
    )
    file_type: FileType | None = Field(
        description="File type for code chunking", default=None
    )
    metadata: dict[str, Any] = Field(
        description="Additional metadata", default_factory=dict
    )
|
||||
|
||||
|
||||
class IngestResult(BaseModel):
    """Outcome of a single ingest operation."""

    success: bool = Field(..., description="Whether ingest succeeded")
    chunks_created: int = Field(
        description="Number of chunks created", default=0
    )
    embeddings_generated: int = Field(
        description="Number of embeddings generated", default=0
    )
    source_path: str | None = Field(
        description="Source path ingested", default=None
    )
    collection: str = Field(
        description="Collection stored in", default="default"
    )
    chunk_ids: list[str] = Field(
        description="IDs of created chunks", default_factory=list
    )
    # Populated only on failure; success=True implies error is None.
    error: str | None = Field(
        description="Error message if failed", default=None
    )
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
    """Parameters controlling a knowledge-base search."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    query: str = Field(..., description="Search query")
    search_type: SearchType = Field(
        description="Type of search", default=SearchType.HYBRID
    )
    # None means the search spans every collection in the project.
    collection: str | None = Field(
        description="Collection to search (None = all)", default=None
    )
    limit: int = Field(description="Max results", default=10, ge=1, le=100)
    threshold: float = Field(
        description="Minimum similarity score", default=0.7, ge=0.0, le=1.0
    )
    file_types: list[FileType] | None = Field(
        description="Filter by file types", default=None
    )
    include_metadata: bool = Field(
        description="Include metadata in results", default=True
    )
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
    """One matching chunk returned by a search, with its relevance score."""

    id: str = Field(..., description="Chunk ID")
    content: str = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score (0-1)")
    source_path: str | None = Field(
        description="Source file path", default=None
    )
    start_line: int | None = Field(
        description="Start line in source", default=None
    )
    end_line: int | None = Field(
        description="End line in source", default=None
    )
    # Enum values are carried as plain strings on the wire.
    chunk_type: str = Field(..., description="Type of chunk")
    file_type: str | None = Field(description="File type", default=None)
    collection: str = Field(..., description="Collection name")
    metadata: dict[str, Any] = Field(
        description="Additional metadata", default_factory=dict
    )

    @classmethod
    def from_embedding(
        cls, embedding: KnowledgeEmbedding, score: float
    ) -> "SearchResult":
        """Build a SearchResult from a stored embedding row and its score.

        The raw vector is not carried over — only the human-readable
        fields and provenance are exposed to callers.
        """
        resolved_file_type = (
            embedding.file_type.value if embedding.file_type else None
        )
        return cls(
            id=embedding.id,
            content=embedding.content,
            score=score,
            source_path=embedding.source_path,
            start_line=embedding.start_line,
            end_line=embedding.end_line,
            chunk_type=embedding.chunk_type.value,
            file_type=resolved_file_type,
            collection=embedding.collection,
            metadata=embedding.metadata,
        )
|
||||
|
||||
|
||||
class SearchResponse(BaseModel):
    """Envelope returned by a search: the results plus query echo and timing."""

    query: str = Field(..., description="Original query")
    search_type: str = Field(..., description="Type of search performed")
    results: list[SearchResult] = Field(
        description="Search results", default_factory=list
    )
    total_results: int = Field(description="Total results found", default=0)
    search_time_ms: float = Field(
        description="Search time in ms", default=0.0
    )
|
||||
|
||||
|
||||
class DeleteRequest(BaseModel):
    """Request to remove content from the knowledge base.

    A target may be given as a source path, a whole collection, or a list
    of chunk IDs. NOTE(review): the model itself does not enforce that
    exactly one selector is set — presumably the server validates this.
    """

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    source_path: str | None = Field(
        description="Delete by source path", default=None
    )
    collection: str | None = Field(
        description="Delete entire collection", default=None
    )
    chunk_ids: list[str] | None = Field(
        description="Delete specific chunks", default=None
    )
|
||||
|
||||
|
||||
class DeleteResult(BaseModel):
    """Outcome of a delete operation."""

    success: bool = Field(..., description="Whether delete succeeded")
    chunks_deleted: int = Field(
        description="Number of chunks deleted", default=0
    )
    # Populated only on failure.
    error: str | None = Field(
        description="Error message if failed", default=None
    )
|
||||
|
||||
|
||||
class CollectionInfo(BaseModel):
    """Summary of one collection within a project."""

    name: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(description="Number of chunks", default=0)
    total_tokens: int = Field(description="Total tokens stored", default=0)
    file_types: list[str] = Field(
        description="File types in collection", default_factory=list
    )
    # Required timestamps — a collection always has a known lifecycle.
    created_at: datetime = Field(..., description="Creation time")
    updated_at: datetime = Field(..., description="Last update time")
|
||||
|
||||
|
||||
class ListCollectionsResponse(BaseModel):
    """All collections visible for a given project."""

    project_id: str = Field(..., description="Project ID")
    collections: list[CollectionInfo] = Field(
        description="Collections in project", default_factory=list
    )
    total_collections: int = Field(description="Total count", default=0)
|
||||
|
||||
|
||||
class CollectionStats(BaseModel):
    """Detailed statistics for a single collection."""

    collection: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(description="Number of chunks", default=0)
    unique_sources: int = Field(
        description="Unique source files", default=0
    )
    total_tokens: int = Field(description="Total tokens", default=0)
    avg_chunk_size: float = Field(
        description="Average chunk size", default=0.0
    )
    # Histograms keyed by the string value of the respective enum.
    chunk_types: dict[str, int] = Field(
        description="Count by chunk type", default_factory=dict
    )
    file_types: dict[str, int] = Field(
        description="Count by file type", default_factory=dict
    )
    # None when the collection is empty.
    oldest_chunk: datetime | None = Field(
        description="Oldest chunk timestamp", default=None
    )
    newest_chunk: datetime | None = Field(
        description="Newest chunk timestamp", default=None
    )
|
||||
Reference in New Issue
Block a user