fix(backend): race condition fixes for task completion and sprint operations

## Changes

### agent_instance.py - Task Completion Counter Race Condition
- Changed `record_task_completion()` from read-modify-write pattern to
  atomic SQL UPDATE
- Previously: Read instance → increment in Python memory → write back
- Now: Uses `UPDATE ... SET tasks_completed = tasks_completed + 1`
- Prevents lost updates when multiple concurrent task completions occur

### sprint.py - Row-Level Locking for Sprint Operations
- Added `with_for_update()` to `complete_sprint()` to prevent race
  conditions during velocity calculation
- Added `with_for_update()` to `cancel_sprint()` for consistency
- Ensures atomic check-and-update for sprint status changes

## Impact
These fixes prevent:
- Counter metrics being lost under concurrent load
- Data corruption during sprint completion
- Race conditions with concurrent sprint status changes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-31 17:23:33 +01:00
parent 06b2491c1f
commit 7280b182bd
2 changed files with 44 additions and 12 deletions

View File

@@ -249,23 +249,35 @@ class CRUDAgentInstance(CRUDBase[AgentInstance, AgentInstanceCreate, AgentInstan
tokens_used: int,
cost_incurred: Decimal,
) -> AgentInstance | None:
"""Record a completed task and update metrics."""
"""Record a completed task and update metrics.
Uses atomic SQL UPDATE to prevent lost updates under concurrent load.
This avoids the read-modify-write race condition that occurs when
multiple task completions happen simultaneously.
"""
try:
now = datetime.now(UTC)
# Use atomic SQL UPDATE to increment counters without race conditions
# This is safe for concurrent updates - no read-modify-write pattern
result = await db.execute(
select(AgentInstance).where(AgentInstance.id == instance_id)
update(AgentInstance)
.where(AgentInstance.id == instance_id)
.values(
tasks_completed=AgentInstance.tasks_completed + 1,
tokens_used=AgentInstance.tokens_used + tokens_used,
cost_incurred=AgentInstance.cost_incurred + cost_incurred,
last_activity_at=now,
updated_at=now,
)
.returning(AgentInstance)
)
instance = result.scalar_one_or_none()
if not instance:
return None
instance.tasks_completed += 1
instance.tokens_used += tokens_used
instance.cost_incurred += cost_incurred
instance.last_activity_at = datetime.now(UTC)
await db.commit()
await db.refresh(instance)
return instance
except Exception as e:
await db.rollback()

View File

@@ -249,9 +249,18 @@ class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
*,
sprint_id: UUID,
) -> Sprint | None:
"""Complete an active sprint and calculate completed points."""
"""Complete an active sprint and calculate completed points.
Uses row-level locking (SELECT FOR UPDATE) to prevent race conditions
when velocity is being calculated and other operations might modify issues.
"""
try:
result = await db.execute(select(Sprint).where(Sprint.id == sprint_id))
# Lock the sprint row to prevent concurrent modifications
result = await db.execute(
select(Sprint)
.where(Sprint.id == sprint_id)
.with_for_update()
)
sprint = result.scalar_one_or_none()
if not sprint:
@@ -265,6 +274,8 @@ class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
sprint.status = SprintStatus.COMPLETED
# Calculate velocity (completed points) from closed issues
# Note: Issues are not locked, but sprint lock ensures this sprint's
# completion is atomic and prevents concurrent completion attempts
points_result = await db.execute(
select(func.sum(Issue.story_points)).where(
Issue.sprint_id == sprint_id,
@@ -289,9 +300,18 @@ class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
*,
sprint_id: UUID,
) -> Sprint | None:
"""Cancel a sprint (only PLANNED or ACTIVE sprints can be cancelled)."""
"""Cancel a sprint (only PLANNED or ACTIVE sprints can be cancelled).
Uses row-level locking to prevent race conditions with concurrent
sprint status modifications.
"""
try:
result = await db.execute(select(Sprint).where(Sprint.id == sprint_id))
# Lock the sprint row to prevent concurrent modifications
result = await db.execute(
select(Sprint)
.where(Sprint.id == sprint_id)
.with_for_update()
)
sprint = result.scalar_one_or_none()
if not sprint: