fix(mcp-kb): add transactional batch insert and atomic document update

- Wrap store_embeddings_batch in transaction for all-or-nothing semantics
- Add replace_source_embeddings method for atomic document updates
- Update collection_manager to use transactional replace
- Prevent race conditions and data inconsistency (closes #77)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 01:07:40 +01:00
parent 953af52d0e
commit cd7a9ccbdf
4 changed files with 195 additions and 50 deletions
--- a/mcp-servers/knowledge-base/database.py
+++ b/mcp-servers/knowledge-base/database.py
@@ -285,38 +285,40 @@ class DatabaseManager:

        try:
            async with self.acquire() as conn:
-                for project_id, collection, content, embedding, chunk_type, metadata in embeddings:
-                    content_hash = self.compute_content_hash(content)
-                    source_path = metadata.get("source_path")
-                    start_line = metadata.get("start_line")
-                    end_line = metadata.get("end_line")
-                    file_type = metadata.get("file_type")
+                # Wrap in transaction for all-or-nothing batch semantics
+                async with conn.transaction():
+                    for project_id, collection, content, embedding, chunk_type, metadata in embeddings:
+                        content_hash = self.compute_content_hash(content)
+                        source_path = metadata.get("source_path")
+                        start_line = metadata.get("start_line")
+                        end_line = metadata.get("end_line")
+                        file_type = metadata.get("file_type")

-                    embedding_id = await conn.fetchval(
-                        """
-                        INSERT INTO knowledge_embeddings
-                        (project_id, collection, content, embedding, chunk_type,
-                         source_path, start_line, end_line, file_type, metadata,
-                         content_hash, expires_at)
-                        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
-                        ON CONFLICT DO NOTHING
-                        RETURNING id
-                        """,
-                        project_id,
-                        collection,
-                        content,
-                        embedding,
-                        chunk_type.value,
-                        source_path,
-                        start_line,
-                        end_line,
-                        file_type,
-                        metadata,
-                        content_hash,
-                        expires_at,
-                    )
-                    if embedding_id:
-                        ids.append(str(embedding_id))
+                        embedding_id = await conn.fetchval(
+                            """
+                            INSERT INTO knowledge_embeddings
+                            (project_id, collection, content, embedding, chunk_type,
+                             source_path, start_line, end_line, file_type, metadata,
+                             content_hash, expires_at)
+                            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
+                            ON CONFLICT DO NOTHING
+                            RETURNING id
+                            """,
+                            project_id,
+                            collection,
+                            content,
+                            embedding,
+                            chunk_type.value,
+                            source_path,
+                            start_line,
+                            end_line,
+                            file_type,
+                            metadata,
+                            content_hash,
+                            expires_at,
+                        )
+                        if embedding_id:
+                            ids.append(str(embedding_id))

            logger.info(f"Stored {len(ids)} embeddings in batch")
            return ids
@@ -535,6 +537,96 @@ class DatabaseManager:
                cause=e,
            )

+    async def replace_source_embeddings(
+        self,
+        project_id: str,
+        source_path: str,
+        collection: str,
+        embeddings: list[tuple[str, list[float], ChunkType, dict[str, Any]]],
+    ) -> tuple[int, list[str]]:
+        """
+        Atomically replace all embeddings for a source path.
+
+        Deletes existing embeddings and inserts new ones in a single transaction,
+        preventing race conditions; raises DatabaseQueryError on PostgresError.
+
+        Args:
+            project_id: Project ID, used in the DELETE filter and on every new row
+            source_path: Source file path being updated; stored on every new row
+            collection: Collection name, used in the DELETE filter and on new rows
+            embeddings: List of (content, embedding, chunk_type, metadata)
+
+        Returns:
+            Tuple of (deleted_count, new_embedding_ids), with IDs as strings
+        """
+        expires_at = None
+        if self._settings.embedding_ttl_days > 0:
+            expires_at = datetime.now(UTC) + timedelta(
+                days=self._settings.embedding_ttl_days
+            )
+
+        try:
+            async with self.acquire() as conn:
+                # Delete and inserts share one transaction for an atomic replace
+                async with conn.transaction():
+                    # First, delete existing embeddings for this source
+                    delete_result = await conn.execute(
+                        """
+                        DELETE FROM knowledge_embeddings
+                        WHERE project_id = $1 AND source_path = $2 AND collection = $3
+                        """,
+                        project_id,
+                        source_path,
+                        collection,
+                    )
+                    deleted_count = int(delete_result.split()[-1])  # e.g. "DELETE 3"
+
+                    # Then insert new embeddings (plain INSERT, no ON CONFLICT clause)
+                    new_ids = []
+                    for content, embedding, chunk_type, metadata in embeddings:
+                        content_hash = self.compute_content_hash(content)
+                        start_line = metadata.get("start_line")
+                        end_line = metadata.get("end_line")
+                        file_type = metadata.get("file_type")
+
+                        embedding_id = await conn.fetchval(
+                            """
+                            INSERT INTO knowledge_embeddings
+                            (project_id, collection, content, embedding, chunk_type,
+                             source_path, start_line, end_line, file_type, metadata,
+                             content_hash, expires_at)
+                            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
+                            RETURNING id
+                            """,
+                            project_id,
+                            collection,
+                            content,
+                            embedding,
+                            chunk_type.value,
+                            source_path,
+                            start_line,
+                            end_line,
+                            file_type,
+                            metadata,
+                            content_hash,
+                            expires_at,
+                        )
+                        if embedding_id:
+                            new_ids.append(str(embedding_id))
+
+                    logger.info(
+                        f"Replaced source {source_path}: deleted {deleted_count}, "
+                        f"inserted {len(new_ids)} embeddings"
+                    )
+                    return deleted_count, new_ids
+
+        except asyncpg.PostgresError as e:
+            logger.error(f"Replace source error: {e}")
+            raise DatabaseQueryError(
+                message=f"Failed to replace source embeddings: {e}",
+                cause=e,
+            )
+
    async def delete_collection(
        self,
        project_id: str,