# Forked from cardosofelipe/fast-next-template.
# Part of a Knowledge Base MCP Server: RAG with pgvector (semantic, keyword,
# and hybrid search via Reciprocal Rank Fusion), Redis-cached embeddings,
# and collection management tools.
"""
Data models for the Knowledge Base MCP Server.

Defines database models, Pydantic schemas, and data structures
for RAG operations with pgvector.
"""

from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field


class SearchType(str, Enum):
    """Search strategies supported by the knowledge base.

    Inherits from ``str`` so members serialize directly in JSON payloads.
    """

    SEMANTIC = "semantic"  # vector-similarity (embedding) search
    KEYWORD = "keyword"  # PostgreSQL full-text search
    HYBRID = "hybrid"  # semantic + keyword combined
class ChunkType(str, Enum):
    """Kind of content a chunk holds.

    String-valued so the member serializes as plain text in JSON.
    """

    CODE = "code"
    MARKDOWN = "markdown"
    TEXT = "text"
    DOCUMENTATION = "documentation"
class FileType(str, Enum):
    """File formats the chunker knows how to handle.

    String-valued for direct JSON serialization.
    """

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    MARKDOWN = "markdown"
    TEXT = "text"
    JSON = "json"
    YAML = "yaml"
    TOML = "toml"
# Maps a lowercase file extension (leading dot included) to its FileType.
FILE_EXTENSION_MAP: dict[str, FileType] = {
    # Code
    ".py": FileType.PYTHON,
    ".js": FileType.JAVASCRIPT,
    ".jsx": FileType.JAVASCRIPT,
    ".ts": FileType.TYPESCRIPT,
    ".tsx": FileType.TYPESCRIPT,
    ".go": FileType.GO,
    ".rs": FileType.RUST,
    ".java": FileType.JAVA,
    # Prose
    ".md": FileType.MARKDOWN,
    ".mdx": FileType.MARKDOWN,
    ".txt": FileType.TEXT,
    # Structured data / config
    ".json": FileType.JSON,
    ".yaml": FileType.YAML,
    ".yml": FileType.YAML,
    ".toml": FileType.TOML,
}
@dataclass
class Chunk:
    """A chunk of content ready for embedding.

    Carries the raw text plus enough provenance (source path, line span)
    to cite the chunk in search results.
    """

    content: str
    chunk_type: ChunkType
    file_type: FileType | None = None
    source_path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    token_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary (enum members become their values)."""
        file_type_value = self.file_type.value if self.file_type else None
        return {
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "file_type": file_type_value,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "metadata": self.metadata,
            "token_count": self.token_count,
        }
@dataclass
class KnowledgeEmbedding:
    """A knowledge embedding stored in the database.

    Pairs a chunk of content with its vector embedding plus provenance
    and lifecycle timestamps.
    """

    id: str
    project_id: str
    collection: str
    content: str
    embedding: list[float]
    chunk_type: ChunkType
    source_path: str | None = None
    start_line: int | None = None
    end_line: int | None = None
    file_type: FileType | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    content_hash: str | None = None
    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    expires_at: datetime | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dictionary.

        The embedding vector itself is deliberately omitted to keep the
        payload small; timestamps are rendered as ISO-8601 strings.
        """
        expires = self.expires_at.isoformat() if self.expires_at else None
        file_type_value = self.file_type.value if self.file_type else None
        return {
            "id": self.id,
            "project_id": self.project_id,
            "collection": self.collection,
            "content": self.content,
            "chunk_type": self.chunk_type.value,
            "source_path": self.source_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "file_type": file_type_value,
            "metadata": self.metadata,
            "content_hash": self.content_hash,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "expires_at": expires,
        }
# Pydantic Request/Response Models

class IngestRequest(BaseModel):
    """Request to ingest content into the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    content: str = Field(..., description="Content to ingest")
    source_path: str | None = Field(
        description="Source file path for reference", default=None
    )
    collection: str = Field(
        description="Collection to store in", default="default"
    )
    chunk_type: ChunkType = Field(
        description="Type of content", default=ChunkType.TEXT
    )
    file_type: FileType | None = Field(
        description="File type for code chunking", default=None
    )
    metadata: dict[str, Any] = Field(
        description="Additional metadata", default_factory=dict
    )
class IngestResult(BaseModel):
    """Outcome of an ingest operation."""

    success: bool = Field(..., description="Whether ingest succeeded")
    chunks_created: int = Field(
        description="Number of chunks created", default=0
    )
    embeddings_generated: int = Field(
        description="Number of embeddings generated", default=0
    )
    source_path: str | None = Field(
        description="Source path ingested", default=None
    )
    collection: str = Field(
        description="Collection stored in", default="default"
    )
    chunk_ids: list[str] = Field(
        description="IDs of created chunks", default_factory=list
    )
    error: str | None = Field(
        description="Error message if failed", default=None
    )
class SearchRequest(BaseModel):
    """Request to search the knowledge base."""

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    query: str = Field(..., description="Search query")
    search_type: SearchType = Field(
        description="Type of search", default=SearchType.HYBRID
    )
    collection: str | None = Field(
        description="Collection to search (None = all)", default=None
    )
    limit: int = Field(description="Max results", default=10, ge=1, le=100)
    threshold: float = Field(
        description="Minimum similarity score", default=0.7, ge=0.0, le=1.0
    )
    file_types: list[FileType] | None = Field(
        description="Filter by file types", default=None
    )
    include_metadata: bool = Field(
        description="Include metadata in results", default=True
    )
class SearchResult(BaseModel):
    """One hit returned by a search."""

    id: str = Field(..., description="Chunk ID")
    content: str = Field(..., description="Chunk content")
    score: float = Field(..., description="Relevance score (0-1)")
    source_path: str | None = Field(
        description="Source file path", default=None
    )
    start_line: int | None = Field(
        description="Start line in source", default=None
    )
    end_line: int | None = Field(
        description="End line in source", default=None
    )
    chunk_type: str = Field(..., description="Type of chunk")
    file_type: str | None = Field(description="File type", default=None)
    collection: str = Field(..., description="Collection name")
    metadata: dict[str, Any] = Field(
        description="Additional metadata", default_factory=dict
    )

    @classmethod
    def from_embedding(
        cls, embedding: KnowledgeEmbedding, score: float
    ) -> "SearchResult":
        """Build a SearchResult from a stored embedding and its relevance score."""
        # Enum members are flattened to their string values for the response.
        file_type_value = (
            embedding.file_type.value if embedding.file_type else None
        )
        return cls(
            id=embedding.id,
            content=embedding.content,
            score=score,
            source_path=embedding.source_path,
            start_line=embedding.start_line,
            end_line=embedding.end_line,
            chunk_type=embedding.chunk_type.value,
            file_type=file_type_value,
            collection=embedding.collection,
            metadata=embedding.metadata,
        )
class SearchResponse(BaseModel):
    """Response from a search operation."""

    query: str = Field(..., description="Original query")
    search_type: str = Field(..., description="Type of search performed")
    results: list[SearchResult] = Field(
        description="Search results", default_factory=list
    )
    total_results: int = Field(description="Total results found", default=0)
    search_time_ms: float = Field(description="Search time in ms", default=0.0)
class DeleteRequest(BaseModel):
    """Request to delete from the knowledge base.

    Exactly which entries are removed depends on which of the optional
    selectors (source_path / collection / chunk_ids) are provided.
    """

    project_id: str = Field(..., description="Project ID for scoping")
    agent_id: str = Field(..., description="Agent ID making the request")
    source_path: str | None = Field(
        description="Delete by source path", default=None
    )
    collection: str | None = Field(
        description="Delete entire collection", default=None
    )
    chunk_ids: list[str] | None = Field(
        description="Delete specific chunks", default=None
    )
class DeleteResult(BaseModel):
    """Outcome of a delete operation."""

    success: bool = Field(..., description="Whether delete succeeded")
    chunks_deleted: int = Field(
        description="Number of chunks deleted", default=0
    )
    error: str | None = Field(
        description="Error message if failed", default=None
    )
class CollectionInfo(BaseModel):
    """Summary information about a single collection."""

    name: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(description="Number of chunks", default=0)
    total_tokens: int = Field(description="Total tokens stored", default=0)
    file_types: list[str] = Field(
        description="File types in collection", default_factory=list
    )
    created_at: datetime = Field(..., description="Creation time")
    updated_at: datetime = Field(..., description="Last update time")
class ListCollectionsResponse(BaseModel):
    """Response for listing collections."""

    project_id: str = Field(..., description="Project ID")
    collections: list[CollectionInfo] = Field(
        description="Collections in project", default_factory=list
    )
    total_collections: int = Field(description="Total count", default=0)
class CollectionStats(BaseModel):
    """Aggregate statistics for a collection."""

    collection: str = Field(..., description="Collection name")
    project_id: str = Field(..., description="Project ID")
    chunk_count: int = Field(description="Number of chunks", default=0)
    unique_sources: int = Field(description="Unique source files", default=0)
    total_tokens: int = Field(description="Total tokens", default=0)
    avg_chunk_size: float = Field(
        description="Average chunk size", default=0.0
    )
    chunk_types: dict[str, int] = Field(
        description="Count by chunk type", default_factory=dict
    )
    file_types: dict[str, int] = Field(
        description="Count by file type", default_factory=dict
    )
    oldest_chunk: datetime | None = Field(
        description="Oldest chunk timestamp", default=None
    )
    newest_chunk: datetime | None = Field(
        description="Newest chunk timestamp", default=None
    )