fix(mcp-kb): add transactional batch insert and atomic document update

- Wrap store_embeddings_batch in transaction for all-or-nothing semantics
- Add replace_source_embeddings method for atomic document updates (see the sketch below)
- Update collection_manager to use transactional replace
- Prevents race conditions and data inconsistency (closes #77)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 01:07:40 +01:00
parent 953af52d0e
commit cd7a9ccbdf
4 changed files with 195 additions and 50 deletions
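
Only the collection_manager caller is shown below; the new replace_source_embeddings method itself lives in one of the other changed files and does not appear in this view. As a minimal sketch of what such a transactional replace can look like: this assumes an asyncpg-style pool and a hypothetical pgvector-backed "embeddings" table, with illustrative column names rather than the project's actual schema.

    # Hypothetical sketch only: asyncpg pool, pgvector "embeddings" table,
    # and column names are all assumptions, not the repository's real schema.
    import json
    import uuid
    from typing import Any

    import asyncpg


    def _vector_text(embedding: list[float]) -> str:
        # pgvector accepts the text form "[x1,x2,...]"; cast with ::vector below.
        return "[" + ",".join(f"{x:g}" for x in embedding) + "]"


    async def replace_source_embeddings(
        pool: asyncpg.Pool,
        project_id: str,
        source_path: str,
        collection: str,
        embeddings: list[tuple[str, list[float], str, dict[str, Any]]],
    ) -> tuple[int, list[str]]:
        """Delete old chunks and insert new ones in a single transaction."""
        async with pool.acquire() as conn:
            async with conn.transaction():
                # The delete runs inside the transaction: if any insert below
                # fails, the delete is rolled back and the old chunks survive.
                status = await conn.execute(
                    "DELETE FROM embeddings"
                    " WHERE project_id = $1 AND source_path = $2 AND collection = $3",
                    project_id, source_path, collection,
                )
                deleted = int(status.split()[-1])  # status is e.g. "DELETE 5"

                chunk_ids: list[str] = []
                for content, embedding, chunk_type, metadata in embeddings:
                    chunk_id = str(uuid.uuid4())
                    await conn.execute(
                        "INSERT INTO embeddings"
                        " (id, project_id, source_path, collection,"
                        "  content, embedding, chunk_type, metadata)"
                        " VALUES ($1, $2, $3, $4, $5, $6::vector, $7, $8::jsonb)",
                        chunk_id, project_id, source_path, collection,
                        content, _vector_text(embedding), chunk_type, json.dumps(metadata),
                    )
                    chunk_ids.append(chunk_id)
        return deleted, chunk_ids

Returning the pair (deleted_count, chunk_ids) matches how the caller in the diff below unpacks the result.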


@@ -265,9 +265,10 @@ class CollectionManager:
         metadata: dict[str, Any] | None = None,
     ) -> IngestResult:
         """
-        Update a document by replacing existing chunks.
+        Update a document by atomically replacing existing chunks.
 
-        Deletes existing chunks for the source path and ingests new content.
+        Uses a database transaction to delete existing chunks and insert new ones
+        atomically, preventing race conditions during concurrent updates.
 
         Args:
             project_id: Project ID
@@ -282,26 +283,76 @@
         Returns:
             Ingest result
         """
-        # First delete existing chunks for this source
-        await self.database.delete_by_source(
-            project_id=project_id,
-            source_path=source_path,
-            collection=collection,
-        )
+        request_metadata = metadata or {}
 
-        # Then ingest new content
-        request = IngestRequest(
-            project_id=project_id,
-            agent_id=agent_id,
+        # Chunk the content
+        chunks = self.chunker_factory.chunk_content(
             content=content,
             source_path=source_path,
-            collection=collection,
-            chunk_type=chunk_type,
             file_type=file_type,
-            metadata=metadata or {},
+            chunk_type=chunk_type,
+            metadata=request_metadata,
         )
-        return await self.ingest(request)
+
+        if not chunks:
+            # No chunks = delete existing and return empty result
+            await self.database.delete_by_source(
+                project_id=project_id,
+                source_path=source_path,
+                collection=collection,
+            )
+            return IngestResult(
+                success=True,
+                chunks_created=0,
+                embeddings_generated=0,
+                source_path=source_path,
+                collection=collection,
+                chunk_ids=[],
+            )
+
+        # Generate embeddings for new chunks
+        chunk_texts = [chunk.content for chunk in chunks]
+        embeddings_list = await self.embeddings.generate_batch(
+            texts=chunk_texts,
+            project_id=project_id,
+            agent_id=agent_id,
+        )
+
+        # Build embeddings data for transactional replace
+        embeddings_data = []
+        for chunk, embedding in zip(chunks, embeddings_list, strict=True):
+            chunk_metadata = {
+                **request_metadata,
+                **chunk.metadata,
+                "token_count": chunk.token_count,
+                "source_path": chunk.source_path or source_path,
+                "start_line": chunk.start_line,
+                "end_line": chunk.end_line,
+                "file_type": (chunk.file_type or file_type).value if (chunk.file_type or file_type) else None,
+            }
+            embeddings_data.append((
+                chunk.content,
+                embedding,
+                chunk.chunk_type,
+                chunk_metadata,
+            ))
+
+        # Atomically replace old embeddings with new ones
+        _, chunk_ids = await self.database.replace_source_embeddings(
+            project_id=project_id,
+            source_path=source_path,
+            collection=collection,
+            embeddings=embeddings_data,
+        )
+        return IngestResult(
+            success=True,
+            chunks_created=len(chunk_ids),
+            embeddings_generated=len(embeddings_list),
+            source_path=source_path,
+            collection=collection,
+            chunk_ids=chunk_ids,
+        )
 
     async def cleanup_expired(self) -> int:
         """