feat(knowledge-base): implement Knowledge Base MCP Server (#57)

Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content,
  list_collections, get_collection_stats, update_document

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-03 21:33:26 +01:00
parent 18d717e996
commit d0fc7f37ff
26 changed files with 9530 additions and 120 deletions

View File

@@ -0,0 +1,331 @@
"""
Collection management for Knowledge Base MCP Server.
Provides operations for managing document collections including
ingestion, deletion, and statistics.
"""
import logging
from typing import Any
from chunking.base import ChunkerFactory, get_chunker_factory
from config import Settings, get_settings
from database import DatabaseManager, get_database_manager
from embeddings import EmbeddingGenerator, get_embedding_generator
from models import (
ChunkType,
CollectionStats,
DeleteRequest,
DeleteResult,
FileType,
IngestRequest,
IngestResult,
ListCollectionsResponse,
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class CollectionManager:
    """
    Manages knowledge base collections.

    Handles document ingestion (chunking, embedding generation, storage),
    deletion, document replacement, and collection listing/statistics.

    The database manager, embedding generator and chunker factory may be
    injected through the constructor; any that are omitted are resolved from
    their module-level getters the first time they are accessed.
    """

    def __init__(
        self,
        settings: Settings | None = None,
        database: DatabaseManager | None = None,
        embeddings: EmbeddingGenerator | None = None,
        chunker_factory: ChunkerFactory | None = None,
    ) -> None:
        """
        Initialize collection manager.

        Args:
            settings: Settings to use; falls back to ``get_settings()``.
            database: Database manager; resolved lazily when omitted.
            embeddings: Embedding generator; resolved lazily when omitted.
            chunker_factory: Chunker factory; resolved lazily when omitted.
        """
        self._settings = settings or get_settings()
        self._database = database
        self._embeddings = embeddings
        self._chunker_factory = chunker_factory

    @property
    def database(self) -> DatabaseManager:
        """Get database manager, resolving the global one on first use."""
        if self._database is None:
            self._database = get_database_manager()
        return self._database

    @property
    def embeddings(self) -> EmbeddingGenerator:
        """Get embedding generator, resolving the global one on first use."""
        if self._embeddings is None:
            self._embeddings = get_embedding_generator()
        return self._embeddings

    @property
    def chunker_factory(self) -> ChunkerFactory:
        """Get chunker factory, resolving the global one on first use."""
        if self._chunker_factory is None:
            self._chunker_factory = get_chunker_factory()
        return self._chunker_factory

    async def _prepare_chunks(self, request: IngestRequest) -> list[tuple[Any, Any]]:
        """
        Chunk the request content and embed every chunk, without storing.

        Args:
            request: Ingest request with content and options

        Returns:
            List of ``(chunk, embedding)`` pairs; empty when the chunker
            produced no chunks (no embedding call is made in that case).

        Raises:
            ValueError: If the number of embeddings differs from the number
                of chunks. Other exceptions from the chunker or the
                embedding generator propagate unchanged.
        """
        chunks = self.chunker_factory.chunk_content(
            content=request.content,
            source_path=request.source_path,
            file_type=request.file_type,
            chunk_type=request.chunk_type,
            metadata=request.metadata,
        )
        if not chunks:
            return []

        embeddings_list = await self.embeddings.generate_batch(
            texts=[chunk.content for chunk in chunks],
            project_id=request.project_id,
            agent_id=request.agent_id,
        )
        # Materialize the strict zip now so a chunk/embedding count mismatch
        # is raised before any row is written, not part-way through storing.
        return list(zip(chunks, embeddings_list, strict=True))

    async def _store_prepared(
        self,
        request: IngestRequest,
        prepared: list[tuple[Any, Any]],
    ) -> IngestResult:
        """
        Store prepared ``(chunk, embedding)`` pairs and build the result.

        Storage errors are logged and reported through a failed
        ``IngestResult`` rather than raised.
        """
        try:
            if not prepared:
                return IngestResult(
                    success=True,
                    chunks_created=0,
                    embeddings_generated=0,
                    source_path=request.source_path,
                    collection=request.collection,
                    chunk_ids=[],
                )

            chunk_ids: list[str] = []
            for chunk, embedding in prepared:
                # Chunk-level metadata overrides request-level metadata;
                # the token count is always recorded.
                chunk_metadata = {
                    **request.metadata,
                    **chunk.metadata,
                    "token_count": chunk.token_count,
                }
                chunk_id = await self.database.store_embedding(
                    project_id=request.project_id,
                    collection=request.collection,
                    content=chunk.content,
                    embedding=embedding,
                    chunk_type=chunk.chunk_type,
                    source_path=chunk.source_path or request.source_path,
                    start_line=chunk.start_line,
                    end_line=chunk.end_line,
                    file_type=chunk.file_type or request.file_type,
                    metadata=chunk_metadata,
                )
                chunk_ids.append(chunk_id)

            logger.info(
                "Ingested %d chunks into collection '%s' for project %s",
                len(prepared),
                request.collection,
                request.project_id,
            )
            return IngestResult(
                success=True,
                chunks_created=len(prepared),
                embeddings_generated=len(prepared),
                source_path=request.source_path,
                collection=request.collection,
                chunk_ids=chunk_ids,
            )
        except Exception as e:
            return self._ingest_failure(request, e)

    @staticmethod
    def _ingest_failure(request: IngestRequest, exc: Exception) -> IngestResult:
        """Log an ingest error with its traceback and build the failed result."""
        logger.error("Ingest error: %s", exc, exc_info=exc)
        return IngestResult(
            success=False,
            chunks_created=0,
            embeddings_generated=0,
            source_path=request.source_path,
            collection=request.collection,
            chunk_ids=[],
            error=str(exc),
        )

    async def ingest(self, request: IngestRequest) -> IngestResult:
        """
        Ingest content into the knowledge base.

        Chunks the content, generates embeddings, and stores them. All
        chunking and embedding work completes before the first row is
        written. Errors are never raised to the caller; they are logged and
        returned as a result with ``success=False`` and ``error`` set.

        Args:
            request: Ingest request with content and options

        Returns:
            Ingest result with created chunk IDs
        """
        try:
            prepared = await self._prepare_chunks(request)
        except Exception as e:
            return self._ingest_failure(request, e)
        return await self._store_prepared(request, prepared)

    async def delete(self, request: DeleteRequest) -> DeleteResult:
        """
        Delete content from the knowledge base.

        Supports deletion by chunk IDs, source path, or collection. The
        first of these that is set on the request (in that order) decides
        which deletion is performed; when deleting by source path the
        request's collection is passed along as well. Errors are logged and
        returned as a result with ``success=False``.

        Args:
            request: Delete request with target specification

        Returns:
            Delete result with count of deleted chunks
        """
        try:
            deleted_count = 0
            if request.chunk_ids:
                # Delete specific chunks
                deleted_count = await self.database.delete_by_ids(
                    project_id=request.project_id,
                    chunk_ids=request.chunk_ids,
                )
            elif request.source_path:
                # Delete by source path
                deleted_count = await self.database.delete_by_source(
                    project_id=request.project_id,
                    source_path=request.source_path,
                    collection=request.collection,
                )
            elif request.collection:
                # Delete entire collection
                deleted_count = await self.database.delete_collection(
                    project_id=request.project_id,
                    collection=request.collection,
                )
            else:
                return DeleteResult(
                    success=False,
                    chunks_deleted=0,
                    error="Must specify chunk_ids, source_path, or collection",
                )

            logger.info(
                "Deleted %s chunks for project %s",
                deleted_count,
                request.project_id,
            )
            return DeleteResult(
                success=True,
                chunks_deleted=deleted_count,
            )
        except Exception as e:
            # Keep the traceback in the log; the caller only gets the message.
            logger.error("Delete error: %s", e, exc_info=e)
            return DeleteResult(
                success=False,
                chunks_deleted=0,
                error=str(e),
            )

    async def list_collections(self, project_id: str) -> ListCollectionsResponse:
        """
        List all collections for a project.

        Args:
            project_id: Project ID

        Returns:
            Response holding the collections returned by the database and
            their count
        """
        collections = await self.database.list_collections(project_id)
        return ListCollectionsResponse(
            project_id=project_id,
            collections=collections,
            total_collections=len(collections),
        )

    async def get_collection_stats(
        self,
        project_id: str,
        collection: str,
    ) -> CollectionStats:
        """
        Get statistics for a collection.

        Args:
            project_id: Project ID
            collection: Collection name

        Returns:
            Collection statistics as returned by the database manager
        """
        return await self.database.get_collection_stats(project_id, collection)

    async def update_document(
        self,
        project_id: str,
        agent_id: str,
        source_path: str,
        content: str,
        collection: str = "default",
        chunk_type: ChunkType = ChunkType.TEXT,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> IngestResult:
        """
        Update a document by replacing existing chunks.

        The new content is chunked and embedded first; only when that
        succeeds are the existing chunks for the source path deleted and the
        new ones stored. A chunking or embedding failure therefore leaves
        the previously stored document untouched and is reported through a
        failed ``IngestResult``.

        Args:
            project_id: Project ID
            agent_id: Agent ID
            source_path: Source file path
            content: New content
            collection: Collection name
            chunk_type: Type of content
            file_type: File type for code chunking
            metadata: Additional metadata

        Returns:
            Ingest result

        Raises:
            Exception: Errors raised while building the request or while
                deleting the existing chunks propagate to the caller.
        """
        request = IngestRequest(
            project_id=project_id,
            agent_id=agent_id,
            content=content,
            source_path=source_path,
            collection=collection,
            chunk_type=chunk_type,
            file_type=file_type,
            metadata=metadata or {},
        )

        # Do the failure-prone work before touching stored data.
        try:
            prepared = await self._prepare_chunks(request)
        except Exception as e:
            return self._ingest_failure(request, e)

        # Replacement is ready: drop the old chunks for this source.
        await self.database.delete_by_source(
            project_id=project_id,
            source_path=source_path,
            collection=collection,
        )

        # NOTE(review): delete and store are separate database calls here, so
        # a storage failure at this point can still leave the document
        # missing or partially stored.
        return await self._store_prepared(request, prepared)

    async def cleanup_expired(self) -> int:
        """
        Remove expired embeddings from all collections.

        Returns:
            Number of embeddings removed, as reported by the database manager
        """
        return await self.database.cleanup_expired()
# Process-wide CollectionManager singleton; stays None until first requested.
_collection_manager: CollectionManager | None = None


def get_collection_manager() -> CollectionManager:
    """Return the shared collection manager, creating it on first call."""
    global _collection_manager
    manager = _collection_manager
    if manager is None:
        manager = CollectionManager()
        _collection_manager = manager
    return manager
def reset_collection_manager() -> None:
    """Discard the shared collection manager instance (for testing)."""
    # Rebind the module-level name directly; equivalent to a `global` assignment.
    globals()["_collection_manager"] = None