fix(mcp-kb): add transactional batch insert and atomic document update

- Wrap store_embeddings_batch in transaction for all-or-nothing semantics
- Add replace_source_embeddings method for atomic document updates (see the sketch below)
- Update collection_manager to use transactional replace
- Prevents race conditions and data inconsistency (closes #77)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 01:07:40 +01:00
parent 953af52d0e
commit cd7a9ccbdf
4 changed files with 195 additions and 50 deletions
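
Only the collection_manager caller is shown below; the new replace_source_embeddings method itself lives in one of the other changed files and does not appear in this view. As a minimal sketch of what such a transactional replace can look like: this assumes an asyncpg-style pool and a hypothetical pgvector-backed "embeddings" table, with illustrative column names rather than the project's actual schema.

    # Hypothetical sketch only: asyncpg pool, pgvector "embeddings" table,
    # and column names are all assumptions, not the repository's real schema.
    import json
    import uuid
    from typing import Any

    import asyncpg


    def _vector_text(embedding: list[float]) -> str:
        # pgvector accepts the text form "[x1,x2,...]"; cast with ::vector below.
        return "[" + ",".join(f"{x:g}" for x in embedding) + "]"


    async def replace_source_embeddings(
        pool: asyncpg.Pool,
        project_id: str,
        source_path: str,
        collection: str,
        embeddings: list[tuple[str, list[float], str, dict[str, Any]]],
    ) -> tuple[int, list[str]]:
        """Delete old chunks and insert new ones in a single transaction."""
        async with pool.acquire() as conn:
            async with conn.transaction():
                # The delete runs inside the transaction: if any insert below
                # fails, the delete is rolled back and the old chunks survive.
                status = await conn.execute(
                    "DELETE FROM embeddings"
                    " WHERE project_id = $1 AND source_path = $2 AND collection = $3",
                    project_id, source_path, collection,
                )
                deleted = int(status.split()[-1])  # status is e.g. "DELETE 5"

                chunk_ids: list[str] = []
                for content, embedding, chunk_type, metadata in embeddings:
                    chunk_id = str(uuid.uuid4())
                    await conn.execute(
                        "INSERT INTO embeddings"
                        " (id, project_id, source_path, collection,"
                        "  content, embedding, chunk_type, metadata)"
                        " VALUES ($1, $2, $3, $4, $5, $6::vector, $7, $8::jsonb)",
                        chunk_id, project_id, source_path, collection,
                        content, _vector_text(embedding), chunk_type, json.dumps(metadata),
                    )
                    chunk_ids.append(chunk_id)
        return deleted, chunk_ids

Returning the pair (deleted_count, chunk_ids) matches how the caller in the diff below unpacks the result.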


@@ -265,9 +265,10 @@ class CollectionManager:
         metadata: dict[str, Any] | None = None,
     ) -> IngestResult:
         """
-        Update a document by replacing existing chunks.
+        Update a document by atomically replacing existing chunks.
 
-        Deletes existing chunks for the source path and ingests new content.
+        Uses a database transaction to delete existing chunks and insert new ones
+        atomically, preventing race conditions during concurrent updates.
 
         Args:
             project_id: Project ID
@@ -282,26 +283,76 @@
         Returns:
             Ingest result
         """
-        # First delete existing chunks for this source
-        await self.database.delete_by_source(
-            project_id=project_id,
-            source_path=source_path,
-            collection=collection,
-        )
+        request_metadata = metadata or {}
 
-        # Then ingest new content
-        request = IngestRequest(
-            project_id=project_id,
-            agent_id=agent_id,
+        # Chunk the content
+        chunks = self.chunker_factory.chunk_content(
             content=content,
             source_path=source_path,
-            collection=collection,
-            chunk_type=chunk_type,
             file_type=file_type,
-            metadata=metadata or {},
+            chunk_type=chunk_type,
+            metadata=request_metadata,
         )
-        return await self.ingest(request)
+
+        if not chunks:
+            # No chunks = delete existing and return empty result
+            await self.database.delete_by_source(
+                project_id=project_id,
+                source_path=source_path,
+                collection=collection,
+            )
+            return IngestResult(
+                success=True,
+                chunks_created=0,
+                embeddings_generated=0,
+                source_path=source_path,
+                collection=collection,
+                chunk_ids=[],
+            )
+
+        # Generate embeddings for new chunks
+        chunk_texts = [chunk.content for chunk in chunks]
+        embeddings_list = await self.embeddings.generate_batch(
+            texts=chunk_texts,
+            project_id=project_id,
+            agent_id=agent_id,
+        )
+
+        # Build embeddings data for transactional replace
+        embeddings_data = []
+        for chunk, embedding in zip(chunks, embeddings_list, strict=True):
+            chunk_metadata = {
+                **request_metadata,
+                **chunk.metadata,
+                "token_count": chunk.token_count,
+                "source_path": chunk.source_path or source_path,
+                "start_line": chunk.start_line,
+                "end_line": chunk.end_line,
+                "file_type": (chunk.file_type or file_type).value if (chunk.file_type or file_type) else None,
+            }
+            embeddings_data.append((
+                chunk.content,
+                embedding,
+                chunk.chunk_type,
+                chunk_metadata,
+            ))
+
+        # Atomically replace old embeddings with new ones
+        _, chunk_ids = await self.database.replace_source_embeddings(
+            project_id=project_id,
+            source_path=source_path,
+            collection=collection,
+            embeddings=embeddings_data,
+        )
+        return IngestResult(
+            success=True,
+            chunks_created=len(chunk_ids),
+            embeddings_generated=len(embeddings_list),
+            source_path=source_path,
+            collection=collection,
+            chunk_ids=chunk_ids,
+        )
 
     async def cleanup_expired(self) -> int:
         """