feat(backend): add Phase B safety subsystems (#63)

Implements core control subsystems for the safety framework: **Action Validation (validation/validator.py):** - Rule-based validation engine with priority ordering - Allow/deny/require-approval rule types - Pattern matching for tools and resources - Validation result caching with LRU eviction - Emergency bypass capability with audit **Permission System (permissions/manager.py):** - Per-agent permission grants on resources - Resource pattern matching (wildcards) - Temporary permissions with expiration - Permission inheritance hierarchy - Default deny with configurable defaults **Cost Control (costs/controller.py):** - Per-session and per-day budget tracking - Token and USD cost limits - Warning alerts at configurable thresholds - Budget rollover and reset policies - Real-time usage tracking **Rate Limiting (limits/limiter.py):** - Sliding window rate limiter - Per-action, per-LLM-call, per-file-op limits - Burst allowance with recovery - Configurable limits per operation type **Loop Detection (loops/detector.py):** - Exact repetition detection (same action+args) - Semantic repetition (similar actions) - Oscillation pattern detection (A→B→A→B) - Per-agent action history tracking - Loop breaking suggestions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 11:28:00 +01:00
parent 498c0a0e94
commit 728edd1453
10 changed files with 2020 additions and 5 deletions
--- a/backend/app/services/safety/loops/detector.py
+++ b/backend/app/services/safety/loops/detector.py
@@ -0,0 +1,267 @@
+"""
+Loop Detector
+
+Detects and prevents action loops in agent behavior.
+"""
+
+import asyncio
+import hashlib
+import json
+import logging
+from collections import Counter, deque
+from typing import Any
+
+from ..config import get_safety_config
+from ..exceptions import LoopDetectedError
+from ..models import ActionRequest
+
+logger = logging.getLogger(__name__)
+
+
+class ActionSignature:
+    """Signature of an action for comparison."""
+
+    def __init__(self, action: ActionRequest) -> None:
+        self.action_type = action.action_type.value
+        self.tool_name = action.tool_name
+        self.resource = action.resource
+        self.args_hash = self._hash_args(action.arguments)
+
+    def _hash_args(self, args: dict[str, Any]) -> str:
+        """Create a hash of the arguments."""
+        try:
+            serialized = json.dumps(args, sort_keys=True, default=str)
+            return hashlib.sha256(serialized.encode()).hexdigest()[:8]
+        except Exception:
+            return ""
+
+    def exact_key(self) -> str:
+        """Key for exact match detection."""
+        return f"{self.action_type}:{self.tool_name}:{self.resource}:{self.args_hash}"
+
+    def semantic_key(self) -> str:
+        """Key for semantic (similar) match detection."""
+        return f"{self.action_type}:{self.tool_name}:{self.resource}"
+
+    def type_key(self) -> str:
+        """Key for action type only."""
+        return f"{self.action_type}"
+
+
+class LoopDetector:
+    """
+    Detects action loops and repetitive behavior.
+
+    Loop Types:
+    - Exact: Same action with same arguments
+    - Semantic: Similar actions (same type/tool/resource, different args)
+    - Oscillation: A→B→A→B patterns
+    """
+
+    def __init__(
+        self,
+        history_size: int | None = None,
+        max_exact_repetitions: int | None = None,
+        max_semantic_repetitions: int | None = None,
+    ) -> None:
+        """
+        Initialize the LoopDetector.
+
+        Args:
+            history_size: Size of action history to track
+            max_exact_repetitions: Max allowed exact repetitions
+            max_semantic_repetitions: Max allowed semantic repetitions
+        """
+        config = get_safety_config()
+
+        self._history_size = history_size or config.loop_history_size
+        self._max_exact = max_exact_repetitions or config.max_repeated_actions
+        self._max_semantic = max_semantic_repetitions or config.max_similar_actions
+
+        # Per-agent history
+        self._histories: dict[str, deque[ActionSignature]] = {}
+        self._lock = asyncio.Lock()
+
+    async def check(self, action: ActionRequest) -> tuple[bool, str | None]:
+        """
+        Check if an action would create a loop.
+
+        Args:
+            action: The action to check
+
+        Returns:
+            Tuple of (is_loop, loop_type)
+        """
+        agent_id = action.metadata.agent_id
+        signature = ActionSignature(action)
+
+        async with self._lock:
+            history = self._get_history(agent_id)
+
+            # Check exact repetition
+            exact_key = signature.exact_key()
+            exact_count = sum(1 for h in history if h.exact_key() == exact_key)
+            if exact_count >= self._max_exact:
+                return True, "exact"
+
+            # Check semantic repetition
+            semantic_key = signature.semantic_key()
+            semantic_count = sum(1 for h in history if h.semantic_key() == semantic_key)
+            if semantic_count >= self._max_semantic:
+                return True, "semantic"
+
+            # Check oscillation (A→B→A→B pattern)
+            if len(history) >= 3:
+                pattern = self._detect_oscillation(history, signature)
+                if pattern:
+                    return True, "oscillation"
+
+        return False, None
+
+    async def check_and_raise(self, action: ActionRequest) -> None:
+        """
+        Check for loops and raise if detected.
+
+        Args:
+            action: The action to check
+
+        Raises:
+            LoopDetectedError: If loop is detected
+        """
+        is_loop, loop_type = await self.check(action)
+        if is_loop:
+            signature = ActionSignature(action)
+            raise LoopDetectedError(
+                f"Loop detected: {loop_type}",
+                loop_type=loop_type or "unknown",
+                repetition_count=self._max_exact if loop_type == "exact" else self._max_semantic,
+                action_pattern=[signature.semantic_key()],
+                agent_id=action.metadata.agent_id,
+                action_id=action.id,
+            )
+
+    async def record(self, action: ActionRequest) -> None:
+        """
+        Record an action in history.
+
+        Args:
+            action: The action to record
+        """
+        agent_id = action.metadata.agent_id
+        signature = ActionSignature(action)
+
+        async with self._lock:
+            history = self._get_history(agent_id)
+            history.append(signature)
+
+    async def clear_history(self, agent_id: str) -> None:
+        """
+        Clear history for an agent.
+
+        Args:
+            agent_id: ID of the agent
+        """
+        async with self._lock:
+            if agent_id in self._histories:
+                self._histories[agent_id].clear()
+
+    async def get_stats(self, agent_id: str) -> dict[str, Any]:
+        """
+        Get loop detection stats for an agent.
+
+        Args:
+            agent_id: ID of the agent
+
+        Returns:
+            Stats dictionary
+        """
+        async with self._lock:
+            history = self._get_history(agent_id)
+
+            # Count action types
+            type_counts = Counter(h.type_key() for h in history)
+            semantic_counts = Counter(h.semantic_key() for h in history)
+
+            return {
+                "history_size": len(history),
+                "max_history": self._history_size,
+                "action_type_counts": dict(type_counts),
+                "top_semantic_patterns": semantic_counts.most_common(5),
+            }
+
+    def _get_history(self, agent_id: str) -> deque[ActionSignature]:
+        """Get or create history for an agent."""
+        if agent_id not in self._histories:
+            self._histories[agent_id] = deque(maxlen=self._history_size)
+        return self._histories[agent_id]
+
+    def _detect_oscillation(
+        self,
+        history: deque[ActionSignature],
+        current: ActionSignature,
+    ) -> bool:
+        """
+        Detect A→B→A→B oscillation pattern.
+
+        Looks at last 4+ actions including current.
+        """
+        if len(history) < 3:
+            return False
+
+        # Get last 3 actions + current
+        recent = [*list(history)[-3:], current]
+
+        # Check for A→B→A→B pattern
+        if len(recent) >= 4:
+            # Get semantic keys
+            keys = [a.semantic_key() for a in recent[-4:]]
+
+            # Pattern: k[0]==k[2] and k[1]==k[3] and k[0]!=k[1]
+            if keys[0] == keys[2] and keys[1] == keys[3] and keys[0] != keys[1]:
+                return True
+
+        return False
+
+
+class LoopBreaker:
+    """
+    Strategies for breaking detected loops.
+    """
+
+    @staticmethod
+    async def suggest_alternatives(
+        action: ActionRequest,
+        loop_type: str,
+    ) -> list[str]:
+        """
+        Suggest alternative actions when loop is detected.
+
+        Args:
+            action: The looping action
+            loop_type: Type of loop detected
+
+        Returns:
+            List of suggestions
+        """
+        suggestions = []
+
+        if loop_type == "exact":
+            suggestions.append(
+                "The same action with identical arguments has been repeated too many times. "
+                "Consider: (1) Verify the action succeeded, (2) Try a different approach, "
+                "(3) Escalate for human review"
+            )
+        elif loop_type == "semantic":
+            suggestions.append(
+                "Similar actions have been repeated too many times. "
+                "Consider: (1) Review if the approach is working, (2) Try an alternative method, "
+                "(3) Request clarification on the goal"
+            )
+        elif loop_type == "oscillation":
+            suggestions.append(
+                "An oscillating pattern was detected (A→B→A→B). "
+                "This usually indicates conflicting goals or a stuck state. "
+                "Consider: (1) Step back and reassess, (2) Request human guidance"
+            )
+
+        return suggestions