Files
fast-next-template/backend/app/services/context/config.py
Felipe Cardoso 1628eacf2b feat(context): enhance timeout handling, tenant isolation, and budget management
- Added timeout enforcement for token counting, scoring, and compression with detailed error handling.
- Introduced tenant isolation in context caching using project and agent identifiers.
- Enhanced budget management with stricter checks for critical context overspending and buffer limitations.
- Optimized per-context locking with cleanup to prevent memory leaks in concurrent environments.
- Updated default assembly timeout settings for improved performance and reliability.
- Improved XML escaping in Claude adapter for safety against injection attacks.
- Standardized token estimation using model-specific ratios.
2026-01-04 15:52:50 +01:00

381 lines
12 KiB
Python

"""
Context Management Engine Configuration.
Provides Pydantic settings for context assembly,
token budget allocation, and caching.
"""
import threading
from functools import lru_cache
from typing import Any
from pydantic import Field, field_validator, model_validator
from pydantic_settings import BaseSettings
class ContextSettings(BaseSettings):
    """
    Configuration for the Context Management Engine.

    All settings can be overridden via environment variables with the
    CTX_ prefix (matching is case-insensitive per ``model_config``).

    Two invariants are enforced at construction time by model
    validators:
      * the seven budget_* percentages must sum to 1.0, and
      * the three scoring_*_weight values must sum to 1.0.
    """

    # ------------------------------------------------------------------
    # Budget allocation percentages (must sum to 1.0)
    # ------------------------------------------------------------------
    budget_system: float = Field(
        default=0.05,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for system prompts (5%)",
    )
    budget_task: float = Field(
        default=0.10,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for task context (10%)",
    )
    budget_knowledge: float = Field(
        default=0.40,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for RAG/knowledge (40%)",
    )
    budget_conversation: float = Field(
        default=0.20,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for conversation history (20%)",
    )
    budget_tools: float = Field(
        default=0.05,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for tool descriptions (5%)",
    )
    budget_response: float = Field(
        default=0.15,
        ge=0.0,
        le=1.0,
        description="Percentage reserved for response (15%)",
    )
    budget_buffer: float = Field(
        default=0.05,
        ge=0.0,
        le=1.0,
        description="Percentage buffer for safety margin (5%)",
    )

    # ------------------------------------------------------------------
    # Scoring weights (must sum to 1.0)
    # ------------------------------------------------------------------
    scoring_relevance_weight: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Weight for relevance scoring",
    )
    scoring_recency_weight: float = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Weight for recency scoring",
    )
    scoring_priority_weight: float = Field(
        default=0.2,
        ge=0.0,
        le=1.0,
        description="Weight for priority scoring",
    )

    # ------------------------------------------------------------------
    # Recency decay settings
    # ------------------------------------------------------------------
    recency_decay_hours: float = Field(
        default=24.0,
        gt=0.0,
        description="Hours until recency score decays to 50%",
    )
    recency_max_age_hours: float = Field(
        default=168.0,
        gt=0.0,
        description="Hours until context is considered stale (7 days)",
    )

    # ------------------------------------------------------------------
    # Compression settings
    # ------------------------------------------------------------------
    compression_threshold: float = Field(
        default=0.8,
        ge=0.0,
        le=1.0,
        description="Compress when budget usage exceeds this percentage",
    )
    truncation_marker: str = Field(
        default="\n\n[...content truncated...]\n\n",
        description="Marker text to insert where content was truncated",
    )
    truncation_preserve_ratio: float = Field(
        default=0.7,
        ge=0.1,
        le=0.9,
        description="Ratio of content to preserve from start in middle truncation (0.7 = 70% start, 30% end)",
    )
    truncation_min_content_length: int = Field(
        default=100,
        ge=10,
        le=1000,
        description="Minimum content length in characters before truncation applies",
    )
    summary_model_group: str = Field(
        default="fast",
        description="Model group to use for summarization",
    )

    # ------------------------------------------------------------------
    # Caching settings
    # ------------------------------------------------------------------
    cache_enabled: bool = Field(
        default=True,
        description="Enable Redis caching for assembled contexts",
    )
    cache_ttl_seconds: int = Field(
        default=3600,
        ge=60,
        le=86400,
        description="Cache TTL in seconds (1 hour default, max 24 hours)",
    )
    cache_prefix: str = Field(
        default="ctx",
        description="Redis key prefix for context cache",
    )
    cache_memory_max_items: int = Field(
        default=1000,
        ge=100,
        le=100000,
        description="Maximum items in memory fallback cache when Redis unavailable",
    )

    # ------------------------------------------------------------------
    # Performance settings
    # ------------------------------------------------------------------
    max_assembly_time_ms: int = Field(
        default=2000,
        ge=10,
        le=30000,
        description="Maximum time for context assembly in milliseconds. "
        "Should be high enough to accommodate MCP calls for knowledge retrieval.",
    )
    parallel_scoring: bool = Field(
        default=True,
        description="Score contexts in parallel for better performance",
    )
    max_parallel_scores: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Maximum number of contexts to score in parallel",
    )

    # ------------------------------------------------------------------
    # Knowledge retrieval settings
    # ------------------------------------------------------------------
    knowledge_search_type: str = Field(
        default="hybrid",
        description="Default search type for knowledge retrieval",
    )
    knowledge_max_results: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Maximum knowledge chunks to retrieve",
    )
    knowledge_min_score: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Minimum relevance score for knowledge",
    )

    # ------------------------------------------------------------------
    # Relevance scoring settings
    # ------------------------------------------------------------------
    relevance_keyword_fallback_weight: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Maximum score for keyword-based fallback scoring (when semantic unavailable)",
    )
    relevance_semantic_max_chars: int = Field(
        default=2000,
        ge=100,
        le=10000,
        description="Maximum content length in chars for semantic similarity computation",
    )

    # ------------------------------------------------------------------
    # Diversity/ranking settings
    # ------------------------------------------------------------------
    diversity_max_per_source: int = Field(
        default=3,
        ge=1,
        le=20,
        description="Maximum contexts from the same source in diversity reranking",
    )

    # ------------------------------------------------------------------
    # Conversation history settings
    # ------------------------------------------------------------------
    conversation_max_turns: int = Field(
        default=20,
        ge=1,
        le=100,
        description="Maximum conversation turns to include",
    )
    conversation_recent_priority: bool = Field(
        default=True,
        description="Prioritize recent conversation turns",
    )

    @field_validator("knowledge_search_type")
    @classmethod
    def validate_search_type(cls, v: str) -> str:
        """Normalize and validate the knowledge search type.

        Settings are matched case-insensitively (``case_sensitive: False``
        in ``model_config``), so accept any casing / surrounding
        whitespace and canonicalize to lowercase before checking.

        Raises:
            ValueError: If the normalized value is not a supported type.
        """
        normalized = v.strip().lower()
        valid_types = {"semantic", "keyword", "hybrid"}
        if normalized not in valid_types:
            # sorted() keeps the error message deterministic across runs
            raise ValueError(f"search_type must be one of: {sorted(valid_types)}")
        return normalized

    @model_validator(mode="after")
    def validate_budget_allocation(self) -> "ContextSettings":
        """Validate that budget percentages sum to 1.0.

        Reuses get_budget_allocation() so the validated field set can
        never drift from the allocation the engine actually uses.
        """
        allocation = self.get_budget_allocation()
        total = sum(allocation.values())
        # Allow small floating point error
        if abs(total - 1.0) > 0.001:
            detail = ", ".join(f"{name}={value}" for name, value in allocation.items())
            raise ValueError(
                f"Budget percentages must sum to 1.0, got {total:.3f}. "
                f"Current allocation: {detail}"
            )
        return self

    @model_validator(mode="after")
    def validate_scoring_weights(self) -> "ContextSettings":
        """Validate that scoring weights sum to 1.0.

        Reuses get_scoring_weights() so the validated field set can
        never drift from the weights the engine actually uses.
        """
        weights = self.get_scoring_weights()
        total = sum(weights.values())
        # Allow small floating point error
        if abs(total - 1.0) > 0.001:
            detail = ", ".join(f"{name}={value}" for name, value in weights.items())
            raise ValueError(
                f"Scoring weights must sum to 1.0, got {total:.3f}. "
                f"Current weights: {detail}"
            )
        return self

    def get_budget_allocation(self) -> dict[str, float]:
        """Get budget allocation as a dictionary keyed by section name."""
        return {
            "system": self.budget_system,
            "task": self.budget_task,
            "knowledge": self.budget_knowledge,
            "conversation": self.budget_conversation,
            "tools": self.budget_tools,
            "response": self.budget_response,
            "buffer": self.budget_buffer,
        }

    def get_scoring_weights(self) -> dict[str, float]:
        """Get scoring weights as a dictionary keyed by score name."""
        return {
            "relevance": self.scoring_relevance_weight,
            "recency": self.scoring_recency_weight,
            "priority": self.scoring_priority_weight,
        }

    def to_dict(self) -> dict[str, Any]:
        """Convert settings to a nested dictionary for logging/debugging."""
        return {
            "budget": self.get_budget_allocation(),
            "scoring": self.get_scoring_weights(),
            "compression": {
                "threshold": self.compression_threshold,
                "summary_model_group": self.summary_model_group,
                "truncation_marker": self.truncation_marker,
                "truncation_preserve_ratio": self.truncation_preserve_ratio,
                "truncation_min_content_length": self.truncation_min_content_length,
            },
            "cache": {
                "enabled": self.cache_enabled,
                "ttl_seconds": self.cache_ttl_seconds,
                "prefix": self.cache_prefix,
                "memory_max_items": self.cache_memory_max_items,
            },
            "performance": {
                "max_assembly_time_ms": self.max_assembly_time_ms,
                "parallel_scoring": self.parallel_scoring,
                "max_parallel_scores": self.max_parallel_scores,
            },
            "knowledge": {
                "search_type": self.knowledge_search_type,
                "max_results": self.knowledge_max_results,
                "min_score": self.knowledge_min_score,
            },
            "relevance": {
                "keyword_fallback_weight": self.relevance_keyword_fallback_weight,
                "semantic_max_chars": self.relevance_semantic_max_chars,
            },
            "diversity": {
                "max_per_source": self.diversity_max_per_source,
            },
            "conversation": {
                "max_turns": self.conversation_max_turns,
                "recent_priority": self.conversation_recent_priority,
            },
        }

    # NOTE(review): env_file is resolved relative to the process CWD,
    # not this module — confirm the service always starts one level
    # below the .env file.
    model_config = {
        "env_prefix": "CTX_",
        "env_file": "../.env",
        "env_file_encoding": "utf-8",
        "case_sensitive": False,
        "extra": "ignore",
    }
# Thread-safe singleton pattern: module-level cache for the process-wide
# settings instance. _settings_lock guards first-time construction in
# get_context_settings() and the reset in reset_context_settings().
_settings: ContextSettings | None = None
_settings_lock = threading.Lock()
def get_context_settings() -> ContextSettings:
    """Return the process-wide ContextSettings singleton.

    Uses double-checked locking: the unlocked fast path serves every
    call after the first, while the locked slow path re-checks so only
    one thread ever constructs the instance.

    Returns:
        The shared ContextSettings instance.
    """
    global _settings
    # Fast path: already built, no lock needed.
    if _settings is not None:
        return _settings
    with _settings_lock:
        # Re-check under the lock; another thread may have won the race.
        if _settings is None:
            _settings = ContextSettings()
        return _settings
def reset_context_settings() -> None:
    """Discard the cached settings singleton.

    Intended for tests; the next get_context_settings() call builds a
    fresh instance from the current environment.
    """
    global _settings
    with _settings_lock:
        _settings = None
@lru_cache(maxsize=1)
def get_default_settings() -> ContextSettings:
    """Return a cached ContextSettings instance.

    Intended for read-only access to the defaults; the lru_cache means
    every caller shares one object and it is never rebuilt. When a
    resettable instance is needed, use get_context_settings() instead.
    """
    defaults = ContextSettings()
    return defaults