""" Data models for Knowledge Base MCP Server. Defines database models, Pydantic schemas, and data structures for RAG operations with pgvector. """ from dataclasses import dataclass, field from datetime import UTC, datetime from enum import Enum from typing import Any from pydantic import BaseModel, Field class SearchType(str, Enum): """Types of search supported.""" SEMANTIC = "semantic" # Vector similarity search KEYWORD = "keyword" # Full-text search HYBRID = "hybrid" # Combined semantic + keyword class ChunkType(str, Enum): """Types of content chunks.""" CODE = "code" MARKDOWN = "markdown" TEXT = "text" DOCUMENTATION = "documentation" class FileType(str, Enum): """Supported file types for chunking.""" PYTHON = "python" JAVASCRIPT = "javascript" TYPESCRIPT = "typescript" GO = "go" RUST = "rust" JAVA = "java" MARKDOWN = "markdown" TEXT = "text" JSON = "json" YAML = "yaml" TOML = "toml" # File extension to FileType mapping FILE_EXTENSION_MAP: dict[str, FileType] = { ".py": FileType.PYTHON, ".js": FileType.JAVASCRIPT, ".jsx": FileType.JAVASCRIPT, ".ts": FileType.TYPESCRIPT, ".tsx": FileType.TYPESCRIPT, ".go": FileType.GO, ".rs": FileType.RUST, ".java": FileType.JAVA, ".md": FileType.MARKDOWN, ".mdx": FileType.MARKDOWN, ".txt": FileType.TEXT, ".json": FileType.JSON, ".yaml": FileType.YAML, ".yml": FileType.YAML, ".toml": FileType.TOML, } @dataclass class Chunk: """A chunk of content ready for embedding.""" content: str chunk_type: ChunkType file_type: FileType | None = None source_path: str | None = None start_line: int | None = None end_line: int | None = None metadata: dict[str, Any] = field(default_factory=dict) token_count: int = 0 def to_dict(self) -> dict[str, Any]: """Convert to dictionary.""" return { "content": self.content, "chunk_type": self.chunk_type.value, "file_type": self.file_type.value if self.file_type else None, "source_path": self.source_path, "start_line": self.start_line, "end_line": self.end_line, "metadata": self.metadata, "token_count": self.token_count, } @dataclass class KnowledgeEmbedding: """ A knowledge embedding stored in the database. Represents a chunk of content with its vector embedding. """ id: str project_id: str collection: str content: str embedding: list[float] chunk_type: ChunkType source_path: str | None = None start_line: int | None = None end_line: int | None = None file_type: FileType | None = None metadata: dict[str, Any] = field(default_factory=dict) content_hash: str | None = None created_at: datetime = field(default_factory=lambda: datetime.now(UTC)) updated_at: datetime = field(default_factory=lambda: datetime.now(UTC)) expires_at: datetime | None = None def to_dict(self) -> dict[str, Any]: """Convert to dictionary (excluding embedding for size).""" return { "id": self.id, "project_id": self.project_id, "collection": self.collection, "content": self.content, "chunk_type": self.chunk_type.value, "source_path": self.source_path, "start_line": self.start_line, "end_line": self.end_line, "file_type": self.file_type.value if self.file_type else None, "metadata": self.metadata, "content_hash": self.content_hash, "created_at": self.created_at.isoformat(), "updated_at": self.updated_at.isoformat(), "expires_at": self.expires_at.isoformat() if self.expires_at else None, } # Pydantic Request/Response Models class IngestRequest(BaseModel): """Request to ingest content into the knowledge base.""" project_id: str = Field(..., description="Project ID for scoping") agent_id: str = Field(..., description="Agent ID making the request") content: str = Field(..., description="Content to ingest") source_path: str | None = Field( default=None, description="Source file path for reference" ) collection: str = Field( default="default", description="Collection to store in" ) chunk_type: ChunkType = Field( default=ChunkType.TEXT, description="Type of content" ) file_type: FileType | None = Field( default=None, description="File type for code chunking" ) metadata: dict[str, Any] = Field( default_factory=dict, description="Additional metadata" ) class IngestResult(BaseModel): """Result of an ingest operation.""" success: bool = Field(..., description="Whether ingest succeeded") chunks_created: int = Field(default=0, description="Number of chunks created") embeddings_generated: int = Field( default=0, description="Number of embeddings generated" ) source_path: str | None = Field(default=None, description="Source path ingested") collection: str = Field(default="default", description="Collection stored in") chunk_ids: list[str] = Field( default_factory=list, description="IDs of created chunks" ) error: str | None = Field(default=None, description="Error message if failed") class SearchRequest(BaseModel): """Request to search the knowledge base.""" project_id: str = Field(..., description="Project ID for scoping") agent_id: str = Field(..., description="Agent ID making the request") query: str = Field(..., description="Search query") search_type: SearchType = Field( default=SearchType.HYBRID, description="Type of search" ) collection: str | None = Field( default=None, description="Collection to search (None = all)" ) limit: int = Field(default=10, ge=1, le=100, description="Max results") threshold: float = Field( default=0.7, ge=0.0, le=1.0, description="Minimum similarity score" ) file_types: list[FileType] | None = Field( default=None, description="Filter by file types" ) include_metadata: bool = Field( default=True, description="Include metadata in results" ) class SearchResult(BaseModel): """A single search result.""" id: str = Field(..., description="Chunk ID") content: str = Field(..., description="Chunk content") score: float = Field(..., description="Relevance score (0-1)") source_path: str | None = Field(default=None, description="Source file path") start_line: int | None = Field(default=None, description="Start line in source") end_line: int | None = Field(default=None, description="End line in source") chunk_type: str = Field(..., description="Type of chunk") file_type: str | None = Field(default=None, description="File type") collection: str = Field(..., description="Collection name") metadata: dict[str, Any] = Field( default_factory=dict, description="Additional metadata" ) @classmethod def from_embedding( cls, embedding: KnowledgeEmbedding, score: float ) -> "SearchResult": """Create SearchResult from KnowledgeEmbedding.""" return cls( id=embedding.id, content=embedding.content, score=score, source_path=embedding.source_path, start_line=embedding.start_line, end_line=embedding.end_line, chunk_type=embedding.chunk_type.value, file_type=embedding.file_type.value if embedding.file_type else None, collection=embedding.collection, metadata=embedding.metadata, ) class SearchResponse(BaseModel): """Response from a search operation.""" query: str = Field(..., description="Original query") search_type: str = Field(..., description="Type of search performed") results: list[SearchResult] = Field( default_factory=list, description="Search results" ) total_results: int = Field(default=0, description="Total results found") search_time_ms: float = Field(default=0.0, description="Search time in ms") class DeleteRequest(BaseModel): """Request to delete from the knowledge base.""" project_id: str = Field(..., description="Project ID for scoping") agent_id: str = Field(..., description="Agent ID making the request") source_path: str | None = Field( default=None, description="Delete by source path" ) collection: str | None = Field( default=None, description="Delete entire collection" ) chunk_ids: list[str] | None = Field( default=None, description="Delete specific chunks" ) class DeleteResult(BaseModel): """Result of a delete operation.""" success: bool = Field(..., description="Whether delete succeeded") chunks_deleted: int = Field(default=0, description="Number of chunks deleted") error: str | None = Field(default=None, description="Error message if failed") class CollectionInfo(BaseModel): """Information about a collection.""" name: str = Field(..., description="Collection name") project_id: str = Field(..., description="Project ID") chunk_count: int = Field(default=0, description="Number of chunks") total_tokens: int = Field(default=0, description="Total tokens stored") file_types: list[str] = Field( default_factory=list, description="File types in collection" ) created_at: datetime = Field(..., description="Creation time") updated_at: datetime = Field(..., description="Last update time") class ListCollectionsResponse(BaseModel): """Response for listing collections.""" project_id: str = Field(..., description="Project ID") collections: list[CollectionInfo] = Field( default_factory=list, description="Collections in project" ) total_collections: int = Field(default=0, description="Total count") class CollectionStats(BaseModel): """Statistics for a collection.""" collection: str = Field(..., description="Collection name") project_id: str = Field(..., description="Project ID") chunk_count: int = Field(default=0, description="Number of chunks") unique_sources: int = Field(default=0, description="Unique source files") total_tokens: int = Field(default=0, description="Total tokens") avg_chunk_size: float = Field(default=0.0, description="Average chunk size") chunk_types: dict[str, int] = Field( default_factory=dict, description="Count by chunk type" ) file_types: dict[str, int] = Field( default_factory=dict, description="Count by file type" ) oldest_chunk: datetime | None = Field( default=None, description="Oldest chunk timestamp" ) newest_chunk: datetime | None = Field( default=None, description="Newest chunk timestamp" )