feat(backend): add Phase B safety subsystems (#63)

Implements core control subsystems for the safety framework:

**Action Validation (validation/validator.py):**
- Rule-based validation engine with priority ordering
- Allow/deny/require-approval rule types
- Pattern matching for tools and resources
- Validation result caching with LRU eviction
- Emergency bypass capability with audit

**Permission System (permissions/manager.py):**
- Per-agent permission grants on resources
- Resource pattern matching (wildcards)
- Temporary permissions with expiration
- Permission inheritance hierarchy
- Default deny with configurable defaults

**Cost Control (costs/controller.py):**
- Per-session and per-day budget tracking
- Token and USD cost limits
- Warning alerts at configurable thresholds
- Budget rollover and reset policies
- Real-time usage tracking

**Rate Limiting (limits/limiter.py):**
- Sliding window rate limiter
- Per-action, per-LLM-call, per-file-op limits
- Burst allowance with recovery
- Configurable limits per operation type

**Loop Detection (loops/detector.py):**
- Exact repetition detection (same action+args)
- Semantic repetition (similar actions)
- Oscillation pattern detection (A→B→A→B)
- Per-agent action history tracking
- Loop breaking suggestions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-03 11:28:00 +01:00
parent 498c0a0e94
commit 728edd1453
10 changed files with 2020 additions and 5 deletions

View File

@@ -0,0 +1,267 @@
"""
Loop Detector
Detects and prevents action loops in agent behavior.
"""
import asyncio
import hashlib
import json
import logging
from collections import Counter, deque
from typing import Any
from ..config import get_safety_config
from ..exceptions import LoopDetectedError
from ..models import ActionRequest
logger = logging.getLogger(__name__)
class ActionSignature:
    """Signature of an action for comparison.

    Captures the fields of an action that matter for loop detection and
    exposes them as string keys at three granularities: exact (including an
    argument digest), semantic (type/tool/resource) and type-only.
    """

    def __init__(self, action: ActionRequest) -> None:
        """
        Build a signature from an action request.

        Args:
            action: The action to fingerprint. Its ``action_type.value``,
                ``tool_name``, ``resource`` and ``arguments`` are read.
        """
        # presumably action_type is an Enum member; only `.value` is used here
        self.action_type = action.action_type.value
        self.tool_name = action.tool_name
        self.resource = action.resource
        self.args_hash = self._hash_args(action.arguments)

    def _hash_args(self, args: dict[str, Any]) -> str:
        """
        Create a short, stable hash of the arguments.

        The arguments are serialized as key-sorted JSON (non-JSON values are
        stringified via ``default=str``) so that key order does not affect
        the result. If JSON serialization is impossible (e.g. non-scalar dict
        keys or circular references) the ``repr`` of the arguments is hashed
        instead, so that different unserializable argument sets do not all
        collapse onto the same key.

        Args:
            args: The action arguments.

        Returns:
            The first 8 hex characters of the SHA-256 digest, or an empty
            string if the arguments cannot be rendered at all.
        """
        try:
            serialized = json.dumps(args, sort_keys=True, default=str)
        except (TypeError, ValueError, RecursionError):
            # Fallback keeps distinct argument sets distinguishable; it is
            # insertion-order sensitive, which is acceptable for a fallback.
            try:
                serialized = repr(args)
            except Exception:
                # Deliberate best effort: hashing must never break the caller.
                return ""
        # backslashreplace only matters for un-encodable text (lone
        # surrogates); for ordinary input the bytes match a plain encode().
        payload = serialized.encode("utf-8", "backslashreplace")
        return hashlib.sha256(payload).hexdigest()[:8]

    def exact_key(self) -> str:
        """Key for exact match detection (type, tool, resource and args hash)."""
        return f"{self.action_type}:{self.tool_name}:{self.resource}:{self.args_hash}"

    def semantic_key(self) -> str:
        """Key for semantic (similar) match detection; ignores the arguments."""
        return f"{self.action_type}:{self.tool_name}:{self.resource}"

    def type_key(self) -> str:
        """Key for action type only."""
        return f"{self.action_type}"
class LoopDetector:
    """
    Detects action loops and repetitive behavior.

    Loop Types:
    - Exact: Same action with same arguments
    - Semantic: Similar actions (same type/tool/resource, different args)
    - Oscillation: A→B→A→B patterns

    A bounded history of action signatures is kept per agent. Histories are
    only created when an action is recorded; read-only queries never allocate
    one, and clearing an agent releases its entry entirely.
    """

    # An A→B→A→B match means the A→B pair has been observed exactly twice.
    _OSCILLATION_REPETITIONS = 2

    def __init__(
        self,
        history_size: int | None = None,
        max_exact_repetitions: int | None = None,
        max_semantic_repetitions: int | None = None,
    ) -> None:
        """
        Initialize the LoopDetector.

        Any argument left as ``None`` (or passed as 0) falls back to the
        corresponding value from the safety config.

        Args:
            history_size: Size of action history to track
            max_exact_repetitions: Max allowed exact repetitions
            max_semantic_repetitions: Max allowed semantic repetitions
        """
        config = get_safety_config()
        self._history_size = history_size or config.loop_history_size
        self._max_exact = max_exact_repetitions or config.max_repeated_actions
        self._max_semantic = max_semantic_repetitions or config.max_similar_actions
        # Per-agent history
        self._histories: dict[str, deque[ActionSignature]] = {}
        self._lock = asyncio.Lock()

    async def check(self, action: ActionRequest) -> tuple[bool, str | None]:
        """
        Check if an action would create a loop.

        The action is compared against the agent's recorded history; it is
        not recorded by this call. Checks run in order: exact, semantic,
        oscillation, and the first match wins.

        Args:
            action: The action to check

        Returns:
            Tuple of (is_loop, loop_type); loop_type is "exact", "semantic",
            "oscillation", or None when no loop is detected.
        """
        agent_id = action.metadata.agent_id
        signature = ActionSignature(action)
        exact_key = signature.exact_key()
        semantic_key = signature.semantic_key()

        async with self._lock:
            # Read-only lookup: do not allocate a history for unseen agents.
            history = self._histories.get(agent_id, ())

            # Check exact repetition
            exact_count = sum(1 for past in history if past.exact_key() == exact_key)
            if exact_count >= self._max_exact:
                return True, "exact"

            # Check semantic repetition
            semantic_count = sum(
                1 for past in history if past.semantic_key() == semantic_key
            )
            if semantic_count >= self._max_semantic:
                return True, "semantic"

            # Check oscillation (A→B→A→B pattern)
            if len(history) >= 3 and self._detect_oscillation(history, signature):
                return True, "oscillation"

        return False, None

    async def check_and_raise(self, action: ActionRequest) -> None:
        """
        Check for loops and raise if detected.

        Args:
            action: The action to check

        Raises:
            LoopDetectedError: If loop is detected. ``repetition_count`` is
                the configured threshold for exact/semantic loops and the
                number of observed A→B cycles (2) for oscillation.
        """
        is_loop, loop_type = await self.check(action)
        if not is_loop:
            return

        if loop_type == "exact":
            repetitions = self._max_exact
        elif loop_type == "oscillation":
            # The semantic threshold is unrelated to an oscillation match.
            repetitions = self._OSCILLATION_REPETITIONS
        else:
            repetitions = self._max_semantic

        signature = ActionSignature(action)
        raise LoopDetectedError(
            f"Loop detected: {loop_type}",
            loop_type=loop_type or "unknown",
            repetition_count=repetitions,
            action_pattern=[signature.semantic_key()],
            agent_id=action.metadata.agent_id,
            action_id=action.id,
        )

    async def record(self, action: ActionRequest) -> None:
        """
        Record an action in history.

        Once the history holds ``history_size`` entries the oldest one is
        discarded (bounded deque).

        Args:
            action: The action to record
        """
        agent_id = action.metadata.agent_id
        signature = ActionSignature(action)
        async with self._lock:
            self._get_history(agent_id).append(signature)

    async def clear_history(self, agent_id: str) -> None:
        """
        Clear history for an agent.

        The agent's entry is removed altogether so finished agents do not
        accumulate; a later ``record`` recreates it. Unknown agents are a
        no-op.

        Args:
            agent_id: ID of the agent
        """
        async with self._lock:
            self._histories.pop(agent_id, None)

    async def get_stats(self, agent_id: str) -> dict[str, Any]:
        """
        Get loop detection stats for an agent.

        This is a read-only query: asking about an unknown agent returns
        empty stats and does not create a history for it.

        Args:
            agent_id: ID of the agent

        Returns:
            Stats dictionary with keys "history_size", "max_history",
            "action_type_counts" and "top_semantic_patterns" (up to 5
            (key, count) pairs, most common first).
        """
        async with self._lock:
            history = self._histories.get(agent_id, ())
            # Count action types
            type_counts = Counter(entry.type_key() for entry in history)
            semantic_counts = Counter(entry.semantic_key() for entry in history)
            recorded = len(history)

        return {
            "history_size": recorded,
            "max_history": self._history_size,
            "action_type_counts": dict(type_counts),
            "top_semantic_patterns": semantic_counts.most_common(5),
        }

    def _get_history(self, agent_id: str) -> deque[ActionSignature]:
        """Get or create history for an agent."""
        history = self._histories.get(agent_id)
        if history is None:
            history = deque(maxlen=self._history_size)
            self._histories[agent_id] = history
        return history

    def _detect_oscillation(
        self,
        history: deque[ActionSignature],
        current: ActionSignature,
    ) -> bool:
        """
        Detect A→B→A→B oscillation pattern.

        Compares the semantic keys of the last three recorded actions plus
        the current one.

        Args:
            history: Recorded signatures for the agent, oldest first
            current: Signature of the action being checked

        Returns:
            True if the four keys form A, B, A, B with A != B.
        """
        if len(history) < 3:
            return False

        # Index from the right end instead of copying the whole history.
        first = history[-3].semantic_key()
        second = history[-2].semantic_key()
        third = history[-1].semantic_key()
        fourth = current.semantic_key()

        return first == third and second == fourth and first != second
class LoopBreaker:
    """
    Strategies for breaking detected loops.
    """

    @staticmethod
    async def suggest_alternatives(
        action: ActionRequest,
        loop_type: str,
    ) -> list[str]:
        """
        Suggest alternative actions when loop is detected.

        Args:
            action: The looping action (currently not inspected)
            loop_type: Type of loop detected ("exact", "semantic" or
                "oscillation")

        Returns:
            List with one suggestion for a known loop type, otherwise an
            empty list.
        """
        advice_by_type = {
            "exact": (
                "The same action with identical arguments has been repeated too many times. "
                "Consider: (1) Verify the action succeeded, (2) Try a different approach, "
                "(3) Escalate for human review"
            ),
            "semantic": (
                "Similar actions have been repeated too many times. "
                "Consider: (1) Review if the approach is working, (2) Try an alternative method, "
                "(3) Request clarification on the goal"
            ),
            "oscillation": (
                "An oscillating pattern was detected (A→B→A→B). "
                "This usually indicates conflicting goals or a stuck state. "
                "Consider: (1) Step back and reassess, (2) Request human guidance"
            ),
        }

        advice = advice_by_type.get(loop_type)
        return [] if advice is None else [advice]