feat(context): enhance timeout handling, tenant isolation, and budget management

- Added timeout enforcement for token counting, scoring, and compression with detailed error handling.
- Introduced tenant isolation in context caching using project and agent identifiers.
- Enhanced budget management with stricter checks for critical context overspending and buffer limitations.
- Optimized per-context locking with cleanup to prevent memory leaks in concurrent environments.
- Updated default assembly timeout settings for improved performance and reliability.
- Improved XML escaping in Claude adapter for safety against injection attacks.
- Standardized token estimation using model-specific ratios.
2026-01-04 15:52:50 +01:00
parent 2bea057fb1
commit 1628eacf2b
10 changed files with 271 additions and 175 deletions
--- a/backend/app/services/context/compression/truncation.py
+++ b/backend/app/services/context/compression/truncation.py
@@ -19,6 +19,40 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


+def _estimate_tokens(text: str, model: str | None = None) -> int:
+    """
+    Estimate a token count from text length and a chars-per-token ratio.
+
+    The ratio is chosen by a case-insensitive substring match of a known key
+    inside ``model`` (first match wins); 4.0 is used when nothing matches.
+
+    Args:
+        text: Text to estimate tokens for
+        model: Optional model name; None or "" selects the default ratio
+
+    Returns:
+        int(len(text) / ratio), floored at 1 (so empty text also yields 1)
+    """
+    # Chars per token, keyed by model-name fragment.
+    model_ratios = {
+        "claude": 3.5,
+        "gpt-4": 4.0,
+        "gpt-3.5": 4.0,
+        "gemini": 4.0,
+    }
+    default_ratio = 4.0  # NOTE(review): meant to mirror TokenCalculator - confirm
+
+    ratio = default_ratio
+    if model:
+        model_lower = model.lower()
+        for model_prefix, model_ratio in model_ratios.items():
+            if model_prefix in model_lower:  # substring test, not a prefix test
+                ratio = model_ratio
+                break  # first matching key in dict order wins
+
+    return max(1, int(len(text) / ratio))  # int() truncates the quotient
+
+
@dataclass
 class TruncationResult:
    """Result of truncation operation."""
@@ -284,8 +318,8 @@ class TruncationStrategy:
        if self._calculator is not None:
            return await self._calculator.count_tokens(text, model)

-        # Fallback estimation
-        return max(1, len(text) // 4)
+        # Fallback estimation with model-specific ratios
+        return _estimate_tokens(text, model)


 class ContextCompressor:
@@ -415,4 +449,5 @@ class ContextCompressor:
        """Count tokens using calculator or estimation."""
        if self._calculator is not None:
            return await self._calculator.count_tokens(text, model)
-        return max(1, len(text) // 4)
+        # Use model-specific estimation for consistency
+        return _estimate_tokens(text, model)