fix(backend): race condition fixes for task completion and sprint operations

## Changes ### agent_instance.py - Task Completion Counter Race Condition - Changed `record_task_completion()` from read-modify-write pattern to atomic SQL UPDATE - Previously: Read instance → increment in Python memory → write back - Now: Uses `UPDATE ... SET tasks_completed = tasks_completed + 1` - Prevents lost updates when multiple concurrent task completions occur ### sprint.py - Row-Level Locking for Sprint Operations - Added `with_for_update()` to `complete_sprint()` to prevent race conditions during velocity calculation - Added `with_for_update()` to `cancel_sprint()` for consistency - Ensures atomic check-and-update for sprint status changes ## Impact These fixes prevent: - Counter metrics being lost under concurrent load - Data corruption during sprint completion - Race conditions with concurrent sprint status changes 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 17:23:33 +01:00
parent 06b2491c1f
commit 7280b182bd
2 changed files with 44 additions and 12 deletions
--- a/backend/app/crud/syndarix/agent_instance.py
+++ b/backend/app/crud/syndarix/agent_instance.py
@@ -249,23 +249,35 @@ class CRUDAgentInstance(CRUDBase[AgentInstance, AgentInstanceCreate, AgentInstan
        tokens_used: int,
        cost_incurred: Decimal,
    ) -> AgentInstance | None:
-        """Record a completed task and update metrics."""
+        """Record a completed task and update metrics.
+
+        Uses atomic SQL UPDATE to prevent lost updates under concurrent load.
+        This avoids the read-modify-write race condition that occurs when
+        multiple task completions happen simultaneously.
+        """
        try:
+            now = datetime.now(UTC)
+
+            # Use atomic SQL UPDATE to increment counters without race conditions
+            # This is safe for concurrent updates - no read-modify-write pattern
            result = await db.execute(
-                select(AgentInstance).where(AgentInstance.id == instance_id)
+                update(AgentInstance)
+                .where(AgentInstance.id == instance_id)
+                .values(
+                    tasks_completed=AgentInstance.tasks_completed + 1,
+                    tokens_used=AgentInstance.tokens_used + tokens_used,
+                    cost_incurred=AgentInstance.cost_incurred + cost_incurred,
+                    last_activity_at=now,
+                    updated_at=now,
+                )
+                .returning(AgentInstance)
            )
            instance = result.scalar_one_or_none()

            if not instance:
                return None

-            instance.tasks_completed += 1
-            instance.tokens_used += tokens_used
-            instance.cost_incurred += cost_incurred
-            instance.last_activity_at = datetime.now(UTC)
-
            await db.commit()
-            await db.refresh(instance)
            return instance
        except Exception as e:
            await db.rollback()
--- a/backend/app/crud/syndarix/sprint.py
+++ b/backend/app/crud/syndarix/sprint.py
@@ -249,9 +249,18 @@ class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
        *,
        sprint_id: UUID,
    ) -> Sprint | None:
-        """Complete an active sprint and calculate completed points."""
+        """Complete an active sprint and calculate completed points.
+
+        Uses row-level locking (SELECT FOR UPDATE) to prevent race conditions
+        when velocity is being calculated and other operations might modify issues.
+        """
        try:
-            result = await db.execute(select(Sprint).where(Sprint.id == sprint_id))
+            # Lock the sprint row to prevent concurrent modifications
+            result = await db.execute(
+                select(Sprint)
+                .where(Sprint.id == sprint_id)
+                .with_for_update()
+            )
            sprint = result.scalar_one_or_none()

            if not sprint:
@@ -265,6 +274,8 @@ class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
            sprint.status = SprintStatus.COMPLETED

            # Calculate velocity (completed points) from closed issues
+            # Note: Issues are not locked, but sprint lock ensures this sprint's
+            # completion is atomic and prevents concurrent completion attempts
            points_result = await db.execute(
                select(func.sum(Issue.story_points)).where(
                    Issue.sprint_id == sprint_id,
@@ -289,9 +300,18 @@ class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
        *,
        sprint_id: UUID,
    ) -> Sprint | None:
-        """Cancel a sprint (only PLANNED or ACTIVE sprints can be cancelled)."""
+        """Cancel a sprint (only PLANNED or ACTIVE sprints can be cancelled).
+
+        Uses row-level locking to prevent race conditions with concurrent
+        sprint status modifications.
+        """
        try:
-            result = await db.execute(select(Sprint).where(Sprint.id == sprint_id))
+            # Lock the sprint row to prevent concurrent modifications
+            result = await db.execute(
+                select(Sprint)
+                .where(Sprint.id == sprint_id)
+                .with_for_update()
+            )
            sprint = result.scalar_one_or_none()

            if not sprint: