feat(context): enhance performance, caching, and settings management

- Replace hard-coded limits with configurable settings (e.g., cache memory size, truncation strategy, relevance settings).
- Optimize parallel execution in token counting, scoring, and reranking for source diversity.
- Improve caching logic:
  - Add per-context locks for safe parallel scoring.
  - Reuse precomputed fingerprints for cache efficiency.
- Make truncation, scoring, and ranker behaviors fully configurable via settings.
- Add support for middle truncation, hash-based context fingerprinting, and dynamic token limiting.
- Refactor methods for scalability and better error handling.

Tests: Updated all affected components with additional test cases.
This commit is contained in:
2026-01-04 12:37:58 +01:00
parent 6c7b72f130
commit 96e6400bd8
8 changed files with 256 additions and 86 deletions

View File

@@ -10,6 +10,7 @@ import re
from dataclasses import dataclass
from typing import TYPE_CHECKING
from ..config import ContextSettings, get_context_settings
from ..types import BaseContext, ContextType
if TYPE_CHECKING:
@@ -45,26 +46,41 @@ class TruncationStrategy:
4. Semantic chunking: Keep most relevant chunks
"""
# Default truncation marker
TRUNCATION_MARKER = "\n\n[...content truncated...]\n\n"
def __init__(
    self,
    calculator: "TokenCalculator | None" = None,
    preserve_ratio_start: float | None = None,
    min_content_length: int | None = None,
    settings: ContextSettings | None = None,
) -> None:
    """
    Initialize truncation strategy.

    Explicit arguments take precedence over the corresponding values in
    ``settings``; ``None`` means "defer to settings".

    Args:
        calculator: Token calculator for accurate counting (may be set
            later via ``set_calculator``).
        preserve_ratio_start: Ratio of content to keep from start
            (overrides settings when not None).
        min_content_length: Minimum characters to preserve (overrides
            settings when not None).
        settings: Context settings (uses the global settings if None).
    """
    # NOTE: the diffed source interleaved the old hard-coded defaults
    # (preserve_ratio_start=0.7, min_content_length=100) with the new
    # settings-backed parameters, which is a SyntaxError (duplicate
    # parameters); only the settings-backed post-image is kept here.
    self._settings = settings or get_context_settings()
    self._calculator = calculator
    # Use explicitly provided values, falling back to settings.
    self._preserve_ratio_start = (
        preserve_ratio_start
        if preserve_ratio_start is not None
        else self._settings.truncation_preserve_ratio
    )
    self._min_content_length = (
        min_content_length
        if min_content_length is not None
        else self._settings.truncation_min_content_length
    )
@property
def TRUNCATION_MARKER(self) -> str:
    # Kept UPPER_SNAKE_CASE for backward compatibility with the former
    # class-level constant of the same name; now resolved dynamically so
    # the marker can be reconfigured via settings.
    """Get truncation marker from settings."""
    return self._settings.truncation_marker
def set_calculator(self, calculator: "TokenCalculator") -> None:
"""Set token calculator."""
@@ -125,7 +141,7 @@ class TruncationStrategy:
truncated_tokens=truncated_tokens,
content=truncated,
truncated=True,
truncation_ratio=1 - (truncated_tokens / original_tokens),
truncation_ratio=0.0 if original_tokens == 0 else 1 - (truncated_tokens / original_tokens),
)
async def _truncate_end(
@@ -141,10 +157,17 @@ class TruncationStrategy:
"""
# Binary search for optimal truncation point
marker_tokens = await self._count_tokens(self.TRUNCATION_MARKER, model)
available_tokens = max_tokens - marker_tokens
available_tokens = max(0, max_tokens - marker_tokens)
# Estimate characters per token
chars_per_token = len(content) / await self._count_tokens(content, model)
# Edge case: if no tokens available for content, return just the marker
if available_tokens <= 0:
return self.TRUNCATION_MARKER
# Estimate characters per token (guard against division by zero)
content_tokens = await self._count_tokens(content, model)
if content_tokens == 0:
return content + self.TRUNCATION_MARKER
chars_per_token = len(content) / content_tokens
# Start with estimated position
estimated_chars = int(available_tokens * chars_per_token)
@@ -243,7 +266,9 @@ class TruncationStrategy:
if current_tokens <= target_tokens:
return content
# Estimate characters
# Estimate characters (guard against division by zero)
if current_tokens == 0:
return content
chars_per_token = len(content) / current_tokens
estimated_chars = int(target_tokens * chars_per_token)