"""
|
|
Data models for Knowledge Base MCP Server.
|
|
|
|
Defines database models, Pydantic schemas, and data structures
|
|
for RAG operations with pgvector.
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class SearchType(str, Enum):
|
|
"""Types of search supported."""
|
|
|
|
SEMANTIC = "semantic" # Vector similarity search
|
|
KEYWORD = "keyword" # Full-text search
|
|
HYBRID = "hybrid" # Combined semantic + keyword
|
|
|
|
|
|
class ChunkType(str, Enum):
|
|
"""Types of content chunks."""
|
|
|
|
CODE = "code"
|
|
MARKDOWN = "markdown"
|
|
TEXT = "text"
|
|
DOCUMENTATION = "documentation"
|
|
|
|
|
|
class FileType(str, Enum):
|
|
"""Supported file types for chunking."""
|
|
|
|
PYTHON = "python"
|
|
JAVASCRIPT = "javascript"
|
|
TYPESCRIPT = "typescript"
|
|
GO = "go"
|
|
RUST = "rust"
|
|
JAVA = "java"
|
|
MARKDOWN = "markdown"
|
|
TEXT = "text"
|
|
JSON = "json"
|
|
YAML = "yaml"
|
|
TOML = "toml"
|
|
|
|
|
|
# File extension to FileType mapping
|
|
FILE_EXTENSION_MAP: dict[str, FileType] = {
|
|
".py": FileType.PYTHON,
|
|
".js": FileType.JAVASCRIPT,
|
|
".jsx": FileType.JAVASCRIPT,
|
|
".ts": FileType.TYPESCRIPT,
|
|
".tsx": FileType.TYPESCRIPT,
|
|
".go": FileType.GO,
|
|
".rs": FileType.RUST,
|
|
".java": FileType.JAVA,
|
|
".md": FileType.MARKDOWN,
|
|
".mdx": FileType.MARKDOWN,
|
|
".txt": FileType.TEXT,
|
|
".json": FileType.JSON,
|
|
".yaml": FileType.YAML,
|
|
".yml": FileType.YAML,
|
|
".toml": FileType.TOML,
|
|
}
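
# Illustrative lookup (a sketch, not part of the original module): resolving a
# FileType from a path suffix via FILE_EXTENSION_MAP. The fallback to
# FileType.TEXT for unknown extensions is an assumption made for this example.
#
#     from pathlib import Path
#     suffix = Path("src/app/models.py").suffix.lower()
#     file_type = FILE_EXTENSION_MAP.get(suffix, FileType.TEXT)
#     assert file_type is FileType.PYTHON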


@dataclass
class Chunk:
    """A chunk of content ready for embedding."""

    content: str
    chunk_type: ChunkType
    file_type: FileType | None = None
    source_path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    token_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return {
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "file_type": self.file_type.value if self.file_type else None,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "metadata": self.metadata,
            "token_count": self.token_count,
        }
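
# Usage sketch (assumed, not from the original module): building a Chunk for a
# small code snippet and serializing it. token_count would normally be filled
# in by whatever tokenizer the chunking logic uses.
#
#     chunk = Chunk(
#         content="def add(a, b):\n    return a + b\n",
#         chunk_type=ChunkType.CODE,
#         file_type=FileType.PYTHON,
#         source_path="src/math_utils.py",
#         start_line=1,
#         end_line=2,
#     )
#     payload = chunk.to_dict()  # {"content": "...", "chunk_type": "code", ...}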


@dataclass
class KnowledgeEmbedding:
    """
    A knowledge embedding stored in the database.

    Represents a chunk of content with its vector embedding.
    """

    id: str
    project_id: str
    collection: str
    content: str
    embedding: list[float]
    chunk_type: ChunkType
    source_path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    file_type: FileType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    content_hash: str | None = None
    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    expires_at: datetime | None = None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary (excluding embedding for size)."""
        return {
            "id": self.id,
            "project_id": self.project_id,
            "collection": self.collection,
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "file_type": self.file_type.value if self.file_type else None,
            "metadata": self.metadata,
            "content_hash": self.content_hash,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "expires_at": self.expires_at.isoformat() if self.expires_at else None,
        }


# Pydantic Request/Response Models


class IngestRequest(BaseModel):
    """Request to ingest content into the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    content: str = Field(..., description="Content to ingest")
    source_path: str | None = Field(
        default=None, description="Source file path for reference"
    )
    collection: str = Field(default="default", description="Collection to store in")
    chunk_type: ChunkType = Field(default=ChunkType.TEXT, description="Type of content")
    file_type: FileType | None = Field(
        default=None, description="File type for code chunking"
    )
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional metadata"
    )
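
# Hedged sketch (the values below are made up for illustration): constructing a
# validated ingest request before handing it to the ingestion pipeline.
#
#     request = IngestRequest(
#         project_id="proj-123",
#         agent_id="agent-abc",
#         content="# Setup\nRun `make install` to bootstrap the project.",
#         source_path="docs/setup.md",
#         chunk_type=ChunkType.MARKDOWN,
#         file_type=FileType.MARKDOWN,
#     )
#     request.model_dump()  # assuming Pydantic v2; use .dict() under v1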


class IngestResult(BaseModel):
    """Result of an ingest operation."""

    success: bool = Field(..., description="Whether ingest succeeded")
    chunks_created: int = Field(default=0, description="Number of chunks created")
    embeddings_generated: int = Field(
        default=0, description="Number of embeddings generated"
    )
    source_path: str | None = Field(default=None, description="Source path ingested")
    collection: str = Field(default="default", description="Collection stored in")
    chunk_ids: list[str] = Field(
        default_factory=list, description="IDs of created chunks"
    )
    error: str | None = Field(default=None, description="Error message if failed")


class SearchRequest(BaseModel):
    """Request to search the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    query: str = Field(..., description="Search query")
    search_type: SearchType = Field(
        default=SearchType.HYBRID, description="Type of search"
    )
    collection: str | None = Field(
        default=None, description="Collection to search (None = all)"
    )
    limit: int = Field(default=10, ge=1, le=100, description="Max results")
    threshold: float = Field(
        default=0.7, ge=0.0, le=1.0, description="Minimum similarity score"
    )
    file_types: list[FileType] | None = Field(
        default=None, description="Filter by file types"
    )
    include_metadata: bool = Field(
        default=True, description="Include metadata in results"
    )


class SearchResult(BaseModel):
    """A single search result."""

    id: str = Field(..., description="Chunk ID")
    content: str = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score (0-1)")
    source_path: str | None = Field(default=None, description="Source file path")
    start_line: int | None = Field(default=None, description="Start line in source")
    end_line: int | None = Field(default=None, description="End line in source")
    chunk_type: str = Field(..., description="Type of chunk")
    file_type: str | None = Field(default=None, description="File type")
    collection: str = Field(..., description="Collection name")
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional metadata"
    )

    @classmethod
    def from_embedding(
        cls, embedding: KnowledgeEmbedding, score: float
    ) -> "SearchResult":
        """Create SearchResult from KnowledgeEmbedding."""
        return cls(
            id=embedding.id,
            content=embedding.content,
            score=score,
            source_path=embedding.source_path,
            start_line=embedding.start_line,
            end_line=embedding.end_line,
            chunk_type=embedding.chunk_type.value,
            file_type=embedding.file_type.value if embedding.file_type else None,
            collection=embedding.collection,
            metadata=embedding.metadata,
        )
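
# Hedged sketch (the scored_embeddings source below is hypothetical, not part
# of this module): converting stored embeddings and their similarity scores
# into SearchResult objects for a SearchResponse.
#
#     results = [
#         SearchResult.from_embedding(embedding, score)
#         for embedding, score in scored_embeddings  # e.g. from a pgvector query
#     ]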


class SearchResponse(BaseModel):
    """Response from a search operation."""

    query: str = Field(..., description="Original query")
    search_type: str = Field(..., description="Type of search performed")
    results: list[SearchResult] = Field(
        default_factory=list, description="Search results"
    )
    total_results: int = Field(default=0, description="Total results found")
    search_time_ms: float = Field(default=0.0, description="Search time in ms")


class DeleteRequest(BaseModel):
    """Request to delete from the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    source_path: str | None = Field(default=None, description="Delete by source path")
    collection: str | None = Field(default=None, description="Delete entire collection")
    chunk_ids: list[str] | None = Field(
        default=None, description="Delete specific chunks"
    )


class DeleteResult(BaseModel):
    """Result of a delete operation."""

    success: bool = Field(..., description="Whether delete succeeded")
    chunks_deleted: int = Field(default=0, description="Number of chunks deleted")
    error: str | None = Field(default=None, description="Error message if failed")


class CollectionInfo(BaseModel):
    """Information about a collection."""

    name: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(default=0, description="Number of chunks")
    total_tokens: int = Field(default=0, description="Total tokens stored")
    file_types: list[str] = Field(
        default_factory=list, description="File types in collection"
    )
    created_at: datetime = Field(..., description="Creation time")
    updated_at: datetime = Field(..., description="Last update time")


class ListCollectionsResponse(BaseModel):
    """Response for listing collections."""

    project_id: str = Field(..., description="Project ID")
    collections: list[CollectionInfo] = Field(
        default_factory=list, description="Collections in project"
    )
    total_collections: int = Field(default=0, description="Total count")


class CollectionStats(BaseModel):
    """Statistics for a collection."""

    collection: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(default=0, description="Number of chunks")
    unique_sources: int = Field(default=0, description="Unique source files")
    total_tokens: int = Field(default=0, description="Total tokens")
    avg_chunk_size: float = Field(default=0.0, description="Average chunk size")
    chunk_types: dict[str, int] = Field(
        default_factory=dict, description="Count by chunk type"
    )
    file_types: dict[str, int] = Field(
        default_factory=dict, description="Count by file type"
    )
    oldest_chunk: datetime | None = Field(
        default=None, description="Oldest chunk timestamp"
    )
    newest_chunk: datetime | None = Field(
        default=None, description="Newest chunk timestamp"
    )