"""
Data models for Knowledge Base MCP Server.

Defines database models, Pydantic schemas, and data structures
for RAG operations with pgvector.
"""

from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field


class SearchType(str, Enum):
    """Types of search supported."""

    SEMANTIC = "semantic"  # Vector similarity search
    KEYWORD = "keyword"  # Full-text search
    HYBRID = "hybrid"  # Combined semantic + keyword


class ChunkType(str, Enum):
    """Types of content chunks."""

    CODE = "code"
    MARKDOWN = "markdown"
    TEXT = "text"
    DOCUMENTATION = "documentation"


class FileType(str, Enum):
    """Supported file types for chunking."""

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    MARKDOWN = "markdown"
    TEXT = "text"
    JSON = "json"
    YAML = "yaml"
    TOML = "toml"


# File extension to FileType mapping
FILE_EXTENSION_MAP: dict[str, FileType] = {
".py": FileType.PYTHON,
".js": FileType.JAVASCRIPT,
".jsx": FileType.JAVASCRIPT,
".ts": FileType.TYPESCRIPT,
".tsx": FileType.TYPESCRIPT,
".go": FileType.GO,
".rs": FileType.RUST,
".java": FileType.JAVA,
".md": FileType.MARKDOWN,
".mdx": FileType.MARKDOWN,
".txt": FileType.TEXT,
".json": FileType.JSON,
".yaml": FileType.YAML,
".yml": FileType.YAML,
".toml": FileType.TOML,
}
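

# Illustrative lookup (not part of this module's API): a hypothetical
# detect_file_type helper resolving a FileType from a path suffix via
# FILE_EXTENSION_MAP. Unknown extensions fall through to None, so a
# caller could default to plain-text chunking.
#
#     from pathlib import Path
#
#     def detect_file_type(path: str) -> FileType | None:
#         return FILE_EXTENSION_MAP.get(Path(path).suffix.lower())
#
#     detect_file_type("src/App.tsx")  # FileType.TYPESCRIPT
#     detect_file_type("notes.rst")    # None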


@dataclass
class Chunk:
    """A chunk of content ready for embedding."""

    content: str
chunk_type: ChunkType
file_type: FileType | None = None
source_path: str | None = None
start_line: int | None = None
end_line: int | None = None
metadata: dict[str, Any] = field(default_factory=dict)
token_count: int = 0

    def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary."""
return {
"content": self.content,
"chunk_type": self.chunk_type.value,
"file_type": self.file_type.value if self.file_type else None,
"source_path": self.source_path,
"start_line": self.start_line,
"end_line": self.end_line,
"metadata": self.metadata,
"token_count": self.token_count,
}
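

# Example usage (values invented): building a code chunk and serializing
# it. to_dict() flattens the enums to their string values, keeping the
# payload JSON-friendly.
#
#     chunk = Chunk(
#         content="def add(a, b):\n    return a + b\n",
#         chunk_type=ChunkType.CODE,
#         file_type=FileType.PYTHON,
#         source_path="src/math_utils.py",
#         start_line=1,
#         end_line=2,
#         token_count=12,
#     )
#     chunk.to_dict()["chunk_type"]  # "code"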


@dataclass
class KnowledgeEmbedding:
    """
    A knowledge embedding stored in the database.

    Represents a chunk of content with its vector embedding.
    """

    id: str
project_id: str
collection: str
content: str
embedding: list[float]
chunk_type: ChunkType
source_path: str | None = None
start_line: int | None = None
end_line: int | None = None
file_type: FileType | None = None
metadata: dict[str, Any] = field(default_factory=dict)
content_hash: str | None = None
created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
expires_at: datetime | None = None

    def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary (excluding embedding for size)."""
return {
"id": self.id,
"project_id": self.project_id,
"collection": self.collection,
"content": self.content,
"chunk_type": self.chunk_type.value,
"source_path": self.source_path,
"start_line": self.start_line,
"end_line": self.end_line,
"file_type": self.file_type.value if self.file_type else None,
"metadata": self.metadata,
"content_hash": self.content_hash,
"created_at": self.created_at.isoformat(),
"updated_at": self.updated_at.isoformat(),
"expires_at": self.expires_at.isoformat() if self.expires_at else None,
}
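

# Note: to_dict() deliberately drops `embedding`, since a vector of
# hundreds or thousands of floats would dominate any serialized payload.
# Sketch, assuming `emb` is a populated KnowledgeEmbedding:
#
#     "embedding" in emb.to_dict()  # False
#     emb.to_dict()["created_at"]   # ISO 8601 string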


# Pydantic Request/Response Models


class IngestRequest(BaseModel):
    """Request to ingest content into the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
agent_id: str = Field(..., description="Agent ID making the request")
content: str = Field(..., description="Content to ingest")
source_path: str | None = Field(
default=None, description="Source file path for reference"
)
collection: str = Field(default="default", description="Collection to store in")
chunk_type: ChunkType = Field(default=ChunkType.TEXT, description="Type of content")
file_type: FileType | None = Field(
default=None, description="File type for code chunking"
)
metadata: dict[str, Any] = Field(
default_factory=dict, description="Additional metadata"
)
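

# Example (IDs and paths invented): ingesting a markdown document into a
# "docs" collection. Only project_id, agent_id, and content are required;
# the remaining fields fall back to the defaults above.
#
#     req = IngestRequest(
#         project_id="proj-123",
#         agent_id="agent-7",
#         content=readme_text,
#         source_path="README.md",
#         collection="docs",
#         chunk_type=ChunkType.MARKDOWN,
#         file_type=FileType.MARKDOWN,
#     )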


class IngestResult(BaseModel):
    """Result of an ingest operation."""

    success: bool = Field(..., description="Whether ingest succeeded")
chunks_created: int = Field(default=0, description="Number of chunks created")
embeddings_generated: int = Field(
default=0, description="Number of embeddings generated"
)
source_path: str | None = Field(default=None, description="Source path ingested")
collection: str = Field(default="default", description="Collection stored in")
chunk_ids: list[str] = Field(
default_factory=list, description="IDs of created chunks"
)
error: str | None = Field(default=None, description="Error message if failed")


class SearchRequest(BaseModel):
    """Request to search the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
agent_id: str = Field(..., description="Agent ID making the request")
query: str = Field(..., description="Search query")
search_type: SearchType = Field(
default=SearchType.HYBRID, description="Type of search"
)
collection: str | None = Field(
default=None, description="Collection to search (None = all)"
)
limit: int = Field(default=10, ge=1, le=100, description="Max results")
threshold: float = Field(
default=0.7, ge=0.0, le=1.0, description="Minimum similarity score"
)
file_types: list[FileType] | None = Field(
default=None, description="Filter by file types"
)
include_metadata: bool = Field(
default=True, description="Include metadata in results"
)
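

# Example (IDs invented): a semantic search capped at 5 results. The
# Field constraints are enforced by Pydantic at construction time, so an
# out-of-range limit (e.g. 0 or 101) raises a ValidationError before the
# request ever reaches the search backend.
#
#     req = SearchRequest(
#         project_id="proj-123",
#         agent_id="agent-7",
#         query="how are code files chunked?",
#         search_type=SearchType.SEMANTIC,
#         limit=5,
#         threshold=0.75,
#     )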


class SearchResult(BaseModel):
    """A single search result."""

    id: str = Field(..., description="Chunk ID")
content: str = Field(..., description="Chunk content")
score: float = Field(..., description="Relevance score (0-1)")
source_path: str | None = Field(default=None, description="Source file path")
start_line: int | None = Field(default=None, description="Start line in source")
end_line: int | None = Field(default=None, description="End line in source")
chunk_type: str = Field(..., description="Type of chunk")
file_type: str | None = Field(default=None, description="File type")
collection: str = Field(..., description="Collection name")
metadata: dict[str, Any] = Field(
default_factory=dict, description="Additional metadata"
)

    @classmethod
def from_embedding(
cls, embedding: KnowledgeEmbedding, score: float
) -> "SearchResult":
"""Create SearchResult from KnowledgeEmbedding."""
return cls(
id=embedding.id,
content=embedding.content,
score=score,
source_path=embedding.source_path,
start_line=embedding.start_line,
end_line=embedding.end_line,
chunk_type=embedding.chunk_type.value,
file_type=embedding.file_type.value if embedding.file_type else None,
collection=embedding.collection,
metadata=embedding.metadata,
)
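

# Sketch, assuming `emb` and a backend-supplied similarity score: the
# score is computed by the search layer (e.g. cosine similarity from
# pgvector) and is carried through unchanged.
#
#     hit = SearchResult.from_embedding(emb, score=0.91)
#     hit.chunk_type  # e.g. "code", since enums are flattened to strings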


class SearchResponse(BaseModel):
    """Response from a search operation."""

    query: str = Field(..., description="Original query")
search_type: str = Field(..., description="Type of search performed")
results: list[SearchResult] = Field(
default_factory=list, description="Search results"
)
total_results: int = Field(default=0, description="Total results found")
search_time_ms: float = Field(default=0.0, description="Search time in ms")


class DeleteRequest(BaseModel):
    """Request to delete from the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
agent_id: str = Field(..., description="Agent ID making the request")
source_path: str | None = Field(default=None, description="Delete by source path")
collection: str | None = Field(default=None, description="Delete entire collection")
chunk_ids: list[str] | None = Field(
default=None, description="Delete specific chunks"
)
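

# Example (values invented): the three optional selectors target
# different scopes, a single source file, a whole collection, or specific
# chunk IDs. Removing everything previously ingested from one file:
#
#     req = DeleteRequest(
#         project_id="proj-123",
#         agent_id="agent-7",
#         source_path="README.md",
#     )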


class DeleteResult(BaseModel):
    """Result of a delete operation."""

    success: bool = Field(..., description="Whether delete succeeded")
chunks_deleted: int = Field(default=0, description="Number of chunks deleted")
error: str | None = Field(default=None, description="Error message if failed")


class CollectionInfo(BaseModel):
    """Information about a collection."""

    name: str = Field(..., description="Collection name")
project_id: str = Field(..., description="Project ID")
chunk_count: int = Field(default=0, description="Number of chunks")
total_tokens: int = Field(default=0, description="Total tokens stored")
file_types: list[str] = Field(
default_factory=list, description="File types in collection"
)
created_at: datetime = Field(..., description="Creation time")
updated_at: datetime = Field(..., description="Last update time")


class ListCollectionsResponse(BaseModel):
    """Response for listing collections."""

    project_id: str = Field(..., description="Project ID")
collections: list[CollectionInfo] = Field(
default_factory=list, description="Collections in project"
)
total_collections: int = Field(default=0, description="Total count")


class CollectionStats(BaseModel):
    """Statistics for a collection."""

    collection: str = Field(..., description="Collection name")
project_id: str = Field(..., description="Project ID")
chunk_count: int = Field(default=0, description="Number of chunks")
unique_sources: int = Field(default=0, description="Unique source files")
total_tokens: int = Field(default=0, description="Total tokens")
avg_chunk_size: float = Field(default=0.0, description="Average chunk size")
chunk_types: dict[str, int] = Field(
default_factory=dict, description="Count by chunk type"
)
file_types: dict[str, int] = Field(
default_factory=dict, description="Count by file type"
)
oldest_chunk: datetime | None = Field(
default=None, description="Oldest chunk timestamp"
)
newest_chunk: datetime | None = Field(
default=None, description="Newest chunk timestamp"
)