syndarix/backend/tests/services/context/test_scoring.py
Commit 0d2005ddcb by Felipe Cardoso
feat(context): implement context scoring and ranking (Phase 3)
Add comprehensive scoring system with three strategies:
- RelevanceScorer: Semantic similarity with keyword fallback (sketched below)
- RecencyScorer: Exponential decay with type-specific half-lives
- PriorityScorer: Priority-based scoring with type bonuses
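
For intuition, a minimal sketch of the keyword fallback. This is a hypothetical helper, not the actual RelevanceScorer API; the real overlap metric may differ:

def keyword_fallback(content: str, query: str) -> float:
    """Hypothetical sketch: fraction of query terms found in the content."""
    content_words = set(content.lower().split())
    query_words = set(query.lower().split())
    if not query_words:
        return 0.0
    return len(content_words & query_words) / len(query_words)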

Implement CompositeScorer combining all strategies with configurable
weights (default: 50% relevance, 30% recency, 20% priority).
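
In sketch form, using the default weights above (the function name and flat-float signature are illustrative assumptions, not the CompositeScorer API):

def composite_score(relevance: float, recency: float, priority: float) -> float:
    """Hypothetical weighted sum mirroring the default weights."""
    return 0.5 * relevance + 0.3 * recency + 0.2 * priority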

Add ContextRanker for budget-aware context selection (sketched after this list) with:
- Greedy selection algorithm respecting token budgets
- CRITICAL priority contexts always included
- Diversity reranking to prevent source dominance
- Comprehensive selection statistics
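
A minimal sketch of the greedy selection under these rules. Types and names are hypothetical, not the actual ContextRanker API, and diversity reranking and statistics are omitted:

from dataclasses import dataclass

@dataclass
class Candidate:
    score: float    # composite score from the scorer
    tokens: int     # estimated token cost
    critical: bool  # CRITICAL priority flag

def greedy_select(candidates: list[Candidate], budget: int) -> list[Candidate]:
    """Greedy budget-aware selection: CRITICAL items first, then best scores."""
    chosen: list[Candidate] = []
    used = 0
    # CRITICAL contexts are always included, even under budget pressure.
    for c in candidates:
        if c.critical:
            chosen.append(c)
            used += c.tokens
    # Fill the remaining budget greedily, highest composite score first.
    rest = sorted((c for c in candidates if not c.critical),
                  key=lambda c: c.score, reverse=True)
    for c in rest:
        if used + c.tokens <= budget:
            chosen.append(c)
            used += c.tokens
    return chosen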

68 tests covering all scoring and ranking functionality.

Part of #61 - Context Management Engine

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 02:24:06 +01:00


"""Tests for context scoring module."""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.services.context.scoring import (
BaseScorer,
CompositeScorer,
PriorityScorer,
RecencyScorer,
RelevanceScorer,
ScoredContext,
)
from app.services.context.types import (
ContextPriority,
ContextType,
ConversationContext,
KnowledgeContext,
MessageRole,
SystemContext,
TaskContext,
)
class TestRelevanceScorer:
    """Tests for RelevanceScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RelevanceScorer()
        assert scorer.weight == 1.0

    def test_creation_with_weight(self) -> None:
        """Test scorer creation with custom weight."""
        scorer = RelevanceScorer(weight=0.5)
        assert scorer.weight == 0.5

    @pytest.mark.asyncio
    async def test_score_with_precomputed_relevance(self) -> None:
        """Test scoring with pre-computed relevance score."""
        scorer = RelevanceScorer()
        # KnowledgeContext with pre-computed score
        context = KnowledgeContext(
            content="Test content about Python",
            source="docs/python.md",
            relevance_score=0.85,
        )
        score = await scorer.score(context, "Python programming")
        assert score == 0.85

    @pytest.mark.asyncio
    async def test_score_with_metadata_score(self) -> None:
        """Test scoring with metadata-provided score."""
        scorer = RelevanceScorer()
        context = SystemContext(
            content="System prompt",
            source="system",
            metadata={"relevance_score": 0.9},
        )
        score = await scorer.score(context, "anything")
        assert score == 0.9

    @pytest.mark.asyncio
    async def test_score_fallback_to_keyword_matching(self) -> None:
        """Test fallback to keyword matching when no score is available."""
        scorer = RelevanceScorer(keyword_fallback_weight=0.5)
        context = TaskContext(
            content="Implement authentication with JWT tokens",
            source="task",
        )
        # Query has matching keywords
        score = await scorer.score(context, "JWT authentication")
        assert score > 0

    @pytest.mark.asyncio
    async def test_keyword_matching_no_overlap(self) -> None:
        """Test keyword matching with no query overlap."""
        scorer = RelevanceScorer()
        context = TaskContext(
            content="Implement database migration",
            source="task",
        )
        score = await scorer.score(context, "xyz abc 123")
        assert score == 0.0

    @pytest.mark.asyncio
    async def test_keyword_matching_full_overlap(self) -> None:
        """Test keyword matching with high overlap."""
        scorer = RelevanceScorer(keyword_fallback_weight=1.0)
        context = TaskContext(
            content="python programming language",
            source="task",
        )
        score = await scorer.score(context, "python programming")
        # Should have a high score due to keyword overlap
        assert score > 0.5

    @pytest.mark.asyncio
    async def test_score_with_mcp_success(self) -> None:
        """Test scoring with MCP semantic similarity."""
        mock_mcp = MagicMock()
        mock_result = MagicMock()
        mock_result.success = True
        mock_result.data = {"similarity": 0.75}
        mock_mcp.call_tool = AsyncMock(return_value=mock_result)
        scorer = RelevanceScorer(mcp_manager=mock_mcp)
        context = TaskContext(
            content="Test content",
            source="task",
        )
        score = await scorer.score(context, "test query")
        assert score == 0.75

    @pytest.mark.asyncio
    async def test_score_with_mcp_failure_fallback(self) -> None:
        """Test fallback when the MCP call fails."""
        mock_mcp = MagicMock()
        mock_mcp.call_tool = AsyncMock(side_effect=Exception("Connection failed"))
        scorer = RelevanceScorer(mcp_manager=mock_mcp, keyword_fallback_weight=0.5)
        context = TaskContext(
            content="Python programming code",
            source="task",
        )
        # Should fall back to keyword matching
        score = await scorer.score(context, "Python code")
        assert score > 0

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RelevanceScorer()
        contexts = [
            KnowledgeContext(content="Python", source="1", relevance_score=0.8),
            KnowledgeContext(content="Java", source="2", relevance_score=0.6),
            KnowledgeContext(content="Go", source="3", relevance_score=0.9),
        ]
        scores = await scorer.score_batch(contexts, "test")
        assert len(scores) == 3
        assert scores[0] == 0.8
        assert scores[1] == 0.6
        assert scores[2] == 0.9

    def test_set_mcp_manager(self) -> None:
        """Test setting the MCP manager."""
        scorer = RelevanceScorer()
        assert scorer._mcp is None
        mock_mcp = MagicMock()
        scorer.set_mcp_manager(mock_mcp)
        assert scorer._mcp is mock_mcp
class TestRecencyScorer:
    """Tests for RecencyScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RecencyScorer()
        assert scorer.weight == 1.0
        assert scorer._half_life_hours == 24.0

    def test_creation_with_custom_half_life(self) -> None:
        """Test scorer creation with custom half-life."""
        scorer = RecencyScorer(half_life_hours=12.0)
        assert scorer._half_life_hours == 12.0

    @pytest.mark.asyncio
    async def test_score_recent_context(self) -> None:
        """Test scoring a very recent context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        context = TaskContext(
            content="Recent task",
            source="task",
            timestamp=now,
        )
        score = await scorer.score(context, "query", reference_time=now)
        # Very recent contexts should score near 1.0
        assert score > 0.99

    @pytest.mark.asyncio
    async def test_score_at_half_life(self) -> None:
        """Test scoring at exactly half-life age."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        half_life_ago = now - timedelta(hours=24)
        context = TaskContext(
            content="Day old task",
            source="task",
            timestamp=half_life_ago,
        )
        score = await scorer.score(context, "query", reference_time=now)
        # At half-life age, the score should be ~0.5
        assert 0.49 <= score <= 0.51
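
    # Note (assumption, not taken from scoring.py): these expectations match
    # exponential decay of the form score = 0.5 ** (age_hours / half_life),
    # which gives 1.0 at age 0, 0.5 at one half-life, and 0.5 ** 7 ≈ 0.008
    # after 7 days with a 24 hour half-life.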
    @pytest.mark.asyncio
    async def test_score_old_context(self) -> None:
        """Test scoring a very old context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        week_ago = now - timedelta(days=7)
        context = TaskContext(
            content="Week old task",
            source="task",
            timestamp=week_ago,
        )
        score = await scorer.score(context, "query", reference_time=now)
        # 7 days with a 24 hour half-life yields a very low score
        assert score < 0.01

    @pytest.mark.asyncio
    async def test_type_specific_half_lives(self) -> None:
        """Test that different context types have different half-lives."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)
        one_hour_ago = now - timedelta(hours=1)
        # Conversation has a 1 hour half-life by default
        conv_context = ConversationContext(
            content="Hello",
            source="chat",
            role=MessageRole.USER,
            timestamp=one_hour_ago,
        )
        # Knowledge has a 168 hour (1 week) half-life by default
        knowledge_context = KnowledgeContext(
            content="Documentation",
            source="docs",
            timestamp=one_hour_ago,
        )
        conv_score = await scorer.score(conv_context, "query", reference_time=now)
        knowledge_score = await scorer.score(
            knowledge_context, "query", reference_time=now
        )
        # Conversation context should decay much faster
        assert conv_score < knowledge_score

    def test_get_half_life(self) -> None:
        """Test getting the half-life for a context type."""
        scorer = RecencyScorer()
        assert scorer.get_half_life(ContextType.CONVERSATION) == 1.0
        assert scorer.get_half_life(ContextType.KNOWLEDGE) == 168.0
        assert scorer.get_half_life(ContextType.SYSTEM) == 720.0

    def test_set_half_life(self) -> None:
        """Test setting a custom half-life."""
        scorer = RecencyScorer()
        scorer.set_half_life(ContextType.TASK, 48.0)
        assert scorer.get_half_life(ContextType.TASK) == 48.0

    def test_set_half_life_invalid(self) -> None:
        """Test setting an invalid half-life."""
        scorer = RecencyScorer()
        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, 0)
        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, -1)

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)
        contexts = [
            TaskContext(content="1", source="t", timestamp=now),
            TaskContext(content="2", source="t", timestamp=now - timedelta(hours=24)),
            TaskContext(content="3", source="t", timestamp=now - timedelta(hours=48)),
        ]
        scores = await scorer.score_batch(contexts, "query", reference_time=now)
        assert len(scores) == 3
        # Scores should be in descending order (more recent = higher)
        assert scores[0] > scores[1] > scores[2]
class TestPriorityScorer:
    """Tests for PriorityScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = PriorityScorer()
        assert scorer.weight == 1.0

    @pytest.mark.asyncio
    async def test_score_critical_priority(self) -> None:
        """Test scoring a CRITICAL priority context."""
        scorer = PriorityScorer()
        context = SystemContext(
            content="Critical system prompt",
            source="system",
            priority=ContextPriority.CRITICAL.value,
        )
        score = await scorer.score(context, "query")
        # CRITICAL (100) + type bonus exceeds 1.0 and is normalized to 1.0
        assert score == 1.0

    @pytest.mark.asyncio
    async def test_score_normal_priority(self) -> None:
        """Test scoring a NORMAL priority context."""
        scorer = PriorityScorer()
        context = TaskContext(
            content="Normal task",
            source="task",
            priority=ContextPriority.NORMAL.value,
        )
        score = await scorer.score(context, "query")
        # NORMAL (50) = 0.5, plus the TASK bonus (0.15) = 0.65
        assert 0.6 <= score <= 0.7
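
    # Note (assumed formula, inferred from the expectations above):
    # score = min(priority / 100 + type_bonus, 1.0), so NORMAL (50) on a
    # TaskContext gives 0.5 + 0.15 = 0.65, and CRITICAL (100) on a
    # SystemContext saturates at 1.0.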
    @pytest.mark.asyncio
    async def test_score_low_priority(self) -> None:
        """Test scoring a LOW priority context."""
        scorer = PriorityScorer()
        context = KnowledgeContext(
            content="Low priority knowledge",
            source="docs",
            priority=ContextPriority.LOW.value,
        )
        score = await scorer.score(context, "query")
        # LOW (20) = 0.2, and KNOWLEDGE gets no type bonus
        assert 0.15 <= score <= 0.25

    @pytest.mark.asyncio
    async def test_type_bonuses(self) -> None:
        """Test type-specific priority bonuses."""
        scorer = PriorityScorer()
        # All contexts share the same base priority
        system_ctx = SystemContext(
            content="System",
            source="system",
            priority=50,
        )
        task_ctx = TaskContext(
            content="Task",
            source="task",
            priority=50,
        )
        knowledge_ctx = KnowledgeContext(
            content="Knowledge",
            source="docs",
            priority=50,
        )
        system_score = await scorer.score(system_ctx, "query")
        task_score = await scorer.score(task_ctx, "query")
        knowledge_score = await scorer.score(knowledge_ctx, "query")
        # System has the highest bonus (0.2), task next (0.15), knowledge none
        assert system_score > task_score > knowledge_score

    def test_get_type_bonus(self) -> None:
        """Test getting the type bonus."""
        scorer = PriorityScorer()
        assert scorer.get_type_bonus(ContextType.SYSTEM) == 0.2
        assert scorer.get_type_bonus(ContextType.TASK) == 0.15
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.0

    def test_set_type_bonus(self) -> None:
        """Test setting a custom type bonus."""
        scorer = PriorityScorer()
        scorer.set_type_bonus(ContextType.KNOWLEDGE, 0.1)
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.1

    def test_set_type_bonus_invalid(self) -> None:
        """Test setting an invalid type bonus."""
        scorer = PriorityScorer()
        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, 1.5)
        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, -0.1)
class TestCompositeScorer:
    """Tests for CompositeScorer."""

    def test_creation(self) -> None:
        """Test scorer creation with default weights."""
        scorer = CompositeScorer()
        weights = scorer.weights
        assert weights["relevance"] == 0.5
        assert weights["recency"] == 0.3
        assert weights["priority"] == 0.2

    def test_creation_with_custom_weights(self) -> None:
        """Test scorer creation with custom weights."""
        scorer = CompositeScorer(
            relevance_weight=0.6,
            recency_weight=0.2,
            priority_weight=0.2,
        )
        weights = scorer.weights
        assert weights["relevance"] == 0.6
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.2

    def test_update_weights(self) -> None:
        """Test updating weights."""
        scorer = CompositeScorer()
        scorer.update_weights(relevance=0.7, recency=0.2, priority=0.1)
        weights = scorer.weights
        assert weights["relevance"] == 0.7
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.1

    def test_update_weights_partial(self) -> None:
        """Test partially updating weights."""
        scorer = CompositeScorer()
        original_recency = scorer.weights["recency"]
        scorer.update_weights(relevance=0.8)
        assert scorer.weights["relevance"] == 0.8
        assert scorer.weights["recency"] == original_recency

    @pytest.mark.asyncio
    async def test_score_basic(self) -> None:
        """Test basic composite scoring."""
        scorer = CompositeScorer()
        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.NORMAL.value,
        )
        score = await scorer.score(context, "test query")
        assert 0.0 <= score <= 1.0

    @pytest.mark.asyncio
    async def test_score_with_details(self) -> None:
        """Test scoring with a detailed breakdown."""
        scorer = CompositeScorer()
        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.HIGH.value,
        )
        scored = await scorer.score_with_details(context, "test query")
        assert isinstance(scored, ScoredContext)
        assert scored.context is context
        assert 0.0 <= scored.composite_score <= 1.0
        assert scored.relevance_score == 0.8
        assert scored.recency_score > 0.9  # Very recent
        assert scored.priority_score > 0.5  # HIGH priority

    @pytest.mark.asyncio
    async def test_score_cached_on_context(self) -> None:
        """Test that the score is cached on the context."""
        scorer = CompositeScorer()
        context = KnowledgeContext(
            content="Test",
            source="docs",
            relevance_score=0.5,
        )
        # First scoring computes and caches the score
        await scorer.score(context, "query")
        assert context._score is not None
        # Second scoring should use the cached value
        context._score = 0.999  # Set to a known value
        score2 = await scorer.score(context, "query")
        assert score2 == 0.999

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = CompositeScorer()
        contexts = [
            KnowledgeContext(
                content="High relevance",
                source="docs",
                relevance_score=0.9,
            ),
            KnowledgeContext(
                content="Low relevance",
                source="docs",
                relevance_score=0.2,
            ),
        ]
        scored = await scorer.score_batch(contexts, "query")
        assert len(scored) == 2
        assert scored[0].relevance_score > scored[1].relevance_score

    @pytest.mark.asyncio
    async def test_rank(self) -> None:
        """Test ranking contexts."""
        scorer = CompositeScorer()
        contexts = [
            KnowledgeContext(content="Low", source="docs", relevance_score=0.2),
            KnowledgeContext(content="High", source="docs", relevance_score=0.9),
            KnowledgeContext(content="Medium", source="docs", relevance_score=0.5),
        ]
        ranked = await scorer.rank(contexts, "query")
        # Should be sorted by score (highest first)
        assert len(ranked) == 3
        assert ranked[0].relevance_score == 0.9
        assert ranked[1].relevance_score == 0.5
        assert ranked[2].relevance_score == 0.2

    @pytest.mark.asyncio
    async def test_rank_with_limit(self) -> None:
        """Test ranking with a limit."""
        scorer = CompositeScorer()
        contexts = [
            KnowledgeContext(content=str(i), source="docs", relevance_score=i / 10)
            for i in range(10)
        ]
        ranked = await scorer.rank(contexts, "query", limit=3)
        assert len(ranked) == 3

    @pytest.mark.asyncio
    async def test_rank_with_min_score(self) -> None:
        """Test ranking with a minimum score threshold."""
        scorer = CompositeScorer()
        contexts = [
            KnowledgeContext(content="Low", source="docs", relevance_score=0.1),
            KnowledgeContext(content="High", source="docs", relevance_score=0.9),
        ]
        ranked = await scorer.rank(contexts, "query", min_score=0.5)
        # Only the high relevance context should pass the threshold
        assert len(ranked) <= 2  # Could be 1 if min_score filters

    def test_set_mcp_manager(self) -> None:
        """Test setting the MCP manager."""
        scorer = CompositeScorer()
        mock_mcp = MagicMock()
        scorer.set_mcp_manager(mock_mcp)
        assert scorer._relevance_scorer._mcp is mock_mcp
class TestScoredContext:
    """Tests for the ScoredContext dataclass."""

    def test_creation(self) -> None:
        """Test ScoredContext creation."""
        context = TaskContext(content="Test", source="task")
        scored = ScoredContext(
            context=context,
            composite_score=0.75,
            relevance_score=0.8,
            recency_score=0.7,
            priority_score=0.5,
        )
        assert scored.context is context
        assert scored.composite_score == 0.75

    def test_comparison_operators(self) -> None:
        """Test comparison operators for sorting."""
        ctx1 = TaskContext(content="1", source="task")
        ctx2 = TaskContext(content="2", source="task")
        scored1 = ScoredContext(context=ctx1, composite_score=0.5)
        scored2 = ScoredContext(context=ctx2, composite_score=0.8)
        assert scored1 < scored2
        assert scored2 > scored1

    def test_sorting(self) -> None:
        """Test sorting scored contexts."""
        contexts = [
            ScoredContext(
                context=TaskContext(content="Low", source="task"),
                composite_score=0.3,
            ),
            ScoredContext(
                context=TaskContext(content="High", source="task"),
                composite_score=0.9,
            ),
            ScoredContext(
                context=TaskContext(content="Medium", source="task"),
                composite_score=0.6,
            ),
        ]
        sorted_contexts = sorted(contexts, reverse=True)
        assert sorted_contexts[0].composite_score == 0.9
        assert sorted_contexts[1].composite_score == 0.6
        assert sorted_contexts[2].composite_score == 0.3
class TestBaseScorer:
    """Tests for the BaseScorer abstract class."""

    def test_weight_property(self) -> None:
        """Test the weight property."""
        # Use a concrete implementation
        scorer = RelevanceScorer(weight=0.7)
        assert scorer.weight == 0.7

    def test_weight_setter_valid(self) -> None:
        """Test the weight setter with valid values."""
        scorer = RelevanceScorer()
        scorer.weight = 0.5
        assert scorer.weight == 0.5

    def test_weight_setter_invalid(self) -> None:
        """Test the weight setter with invalid values."""
        scorer = RelevanceScorer()
        with pytest.raises(ValueError):
            scorer.weight = -0.1
        with pytest.raises(ValueError):
            scorer.weight = 1.5

    def test_normalize_score(self) -> None:
        """Test score normalization."""
        scorer = RelevanceScorer()
        # Normal range passes through
        assert scorer.normalize_score(0.5) == 0.5
        # Below 0 clamps to 0.0
        assert scorer.normalize_score(-0.5) == 0.0
        # Above 1 clamps to 1.0
        assert scorer.normalize_score(1.5) == 1.0
        # Boundaries are unchanged
        assert scorer.normalize_score(0.0) == 0.0
        assert scorer.normalize_score(1.0) == 1.0