"""Tests for context scoring module.""" from datetime import UTC, datetime, timedelta from unittest.mock import AsyncMock, MagicMock import pytest from app.services.context.scoring import ( BaseScorer, CompositeScorer, PriorityScorer, RecencyScorer, RelevanceScorer, ScoredContext, ) from app.services.context.types import ( ContextPriority, ContextType, ConversationContext, KnowledgeContext, MessageRole, SystemContext, TaskContext, ) class TestRelevanceScorer: """Tests for RelevanceScorer.""" def test_creation(self) -> None: """Test scorer creation.""" scorer = RelevanceScorer() assert scorer.weight == 1.0 def test_creation_with_weight(self) -> None: """Test scorer creation with custom weight.""" scorer = RelevanceScorer(weight=0.5) assert scorer.weight == 0.5 @pytest.mark.asyncio async def test_score_with_precomputed_relevance(self) -> None: """Test scoring with pre-computed relevance score.""" scorer = RelevanceScorer() # KnowledgeContext with pre-computed score context = KnowledgeContext( content="Test content about Python", source="docs/python.md", relevance_score=0.85, ) score = await scorer.score(context, "Python programming") assert score == 0.85 @pytest.mark.asyncio async def test_score_with_metadata_score(self) -> None: """Test scoring with metadata-provided score.""" scorer = RelevanceScorer() context = SystemContext( content="System prompt", source="system", metadata={"relevance_score": 0.9}, ) score = await scorer.score(context, "anything") assert score == 0.9 @pytest.mark.asyncio async def test_score_fallback_to_keyword_matching(self) -> None: """Test fallback to keyword matching when no score available.""" scorer = RelevanceScorer(keyword_fallback_weight=0.5) context = TaskContext( content="Implement authentication with JWT tokens", source="task", ) # Query has matching keywords score = await scorer.score(context, "JWT authentication") assert score > 0 @pytest.mark.asyncio async def test_keyword_matching_no_overlap(self) -> None: """Test keyword matching with no query overlap.""" scorer = RelevanceScorer() context = TaskContext( content="Implement database migration", source="task", ) score = await scorer.score(context, "xyz abc 123") assert score == 0.0 @pytest.mark.asyncio async def test_keyword_matching_full_overlap(self) -> None: """Test keyword matching with high overlap.""" scorer = RelevanceScorer(keyword_fallback_weight=1.0) context = TaskContext( content="python programming language", source="task", ) score = await scorer.score(context, "python programming") # Should have high score due to keyword overlap assert score > 0.5 @pytest.mark.asyncio async def test_score_with_mcp_success(self) -> None: """Test scoring with MCP semantic similarity.""" mock_mcp = MagicMock() mock_result = MagicMock() mock_result.success = True mock_result.data = {"similarity": 0.75} mock_mcp.call_tool = AsyncMock(return_value=mock_result) scorer = RelevanceScorer(mcp_manager=mock_mcp) context = TaskContext( content="Test content", source="task", ) score = await scorer.score(context, "test query") assert score == 0.75 @pytest.mark.asyncio async def test_score_with_mcp_failure_fallback(self) -> None: """Test fallback when MCP fails.""" mock_mcp = MagicMock() mock_mcp.call_tool = AsyncMock(side_effect=Exception("Connection failed")) scorer = RelevanceScorer(mcp_manager=mock_mcp, keyword_fallback_weight=0.5) context = TaskContext( content="Python programming code", source="task", ) # Should fall back to keyword matching score = await scorer.score(context, "Python code") assert score > 0 @pytest.mark.asyncio async def test_score_batch(self) -> None: """Test batch scoring.""" scorer = RelevanceScorer() contexts = [ KnowledgeContext( content="Python", source="1", relevance_score=0.8 ), KnowledgeContext( content="Java", source="2", relevance_score=0.6 ), KnowledgeContext( content="Go", source="3", relevance_score=0.9 ), ] scores = await scorer.score_batch(contexts, "test") assert len(scores) == 3 assert scores[0] == 0.8 assert scores[1] == 0.6 assert scores[2] == 0.9 def test_set_mcp_manager(self) -> None: """Test setting MCP manager.""" scorer = RelevanceScorer() assert scorer._mcp is None mock_mcp = MagicMock() scorer.set_mcp_manager(mock_mcp) assert scorer._mcp is mock_mcp class TestRecencyScorer: """Tests for RecencyScorer.""" def test_creation(self) -> None: """Test scorer creation.""" scorer = RecencyScorer() assert scorer.weight == 1.0 assert scorer._half_life_hours == 24.0 def test_creation_with_custom_half_life(self) -> None: """Test scorer creation with custom half-life.""" scorer = RecencyScorer(half_life_hours=12.0) assert scorer._half_life_hours == 12.0 @pytest.mark.asyncio async def test_score_recent_context(self) -> None: """Test scoring a very recent context.""" scorer = RecencyScorer(half_life_hours=24.0) now = datetime.now(UTC) context = TaskContext( content="Recent task", source="task", timestamp=now, ) score = await scorer.score(context, "query", reference_time=now) # Very recent should have score near 1.0 assert score > 0.99 @pytest.mark.asyncio async def test_score_at_half_life(self) -> None: """Test scoring at exactly half-life age.""" scorer = RecencyScorer(half_life_hours=24.0) now = datetime.now(UTC) half_life_ago = now - timedelta(hours=24) context = TaskContext( content="Day old task", source="task", timestamp=half_life_ago, ) score = await scorer.score(context, "query", reference_time=now) # At half-life, score should be ~0.5 assert 0.49 <= score <= 0.51 @pytest.mark.asyncio async def test_score_old_context(self) -> None: """Test scoring a very old context.""" scorer = RecencyScorer(half_life_hours=24.0) now = datetime.now(UTC) week_ago = now - timedelta(days=7) context = TaskContext( content="Week old task", source="task", timestamp=week_ago, ) score = await scorer.score(context, "query", reference_time=now) # 7 days with 24h half-life = very low score assert score < 0.01 @pytest.mark.asyncio async def test_type_specific_half_lives(self) -> None: """Test that different context types have different half-lives.""" scorer = RecencyScorer() now = datetime.now(UTC) one_hour_ago = now - timedelta(hours=1) # Conversation has 1 hour half-life by default conv_context = ConversationContext( content="Hello", source="chat", role=MessageRole.USER, timestamp=one_hour_ago, ) # Knowledge has 168 hour (1 week) half-life by default knowledge_context = KnowledgeContext( content="Documentation", source="docs", timestamp=one_hour_ago, ) conv_score = await scorer.score(conv_context, "query", reference_time=now) knowledge_score = await scorer.score(knowledge_context, "query", reference_time=now) # Conversation should decay much faster assert conv_score < knowledge_score def test_get_half_life(self) -> None: """Test getting half-life for context type.""" scorer = RecencyScorer() assert scorer.get_half_life(ContextType.CONVERSATION) == 1.0 assert scorer.get_half_life(ContextType.KNOWLEDGE) == 168.0 assert scorer.get_half_life(ContextType.SYSTEM) == 720.0 def test_set_half_life(self) -> None: """Test setting custom half-life.""" scorer = RecencyScorer() scorer.set_half_life(ContextType.TASK, 48.0) assert scorer.get_half_life(ContextType.TASK) == 48.0 def test_set_half_life_invalid(self) -> None: """Test setting invalid half-life.""" scorer = RecencyScorer() with pytest.raises(ValueError): scorer.set_half_life(ContextType.TASK, 0) with pytest.raises(ValueError): scorer.set_half_life(ContextType.TASK, -1) @pytest.mark.asyncio async def test_score_batch(self) -> None: """Test batch scoring.""" scorer = RecencyScorer() now = datetime.now(UTC) contexts = [ TaskContext(content="1", source="t", timestamp=now), TaskContext( content="2", source="t", timestamp=now - timedelta(hours=24) ), TaskContext( content="3", source="t", timestamp=now - timedelta(hours=48) ), ] scores = await scorer.score_batch(contexts, "query", reference_time=now) assert len(scores) == 3 # Scores should be in descending order (more recent = higher) assert scores[0] > scores[1] > scores[2] class TestPriorityScorer: """Tests for PriorityScorer.""" def test_creation(self) -> None: """Test scorer creation.""" scorer = PriorityScorer() assert scorer.weight == 1.0 @pytest.mark.asyncio async def test_score_critical_priority(self) -> None: """Test scoring CRITICAL priority context.""" scorer = PriorityScorer() context = SystemContext( content="Critical system prompt", source="system", priority=ContextPriority.CRITICAL.value, ) score = await scorer.score(context, "query") # CRITICAL (100) + type bonus should be > 1.0, normalized to 1.0 assert score == 1.0 @pytest.mark.asyncio async def test_score_normal_priority(self) -> None: """Test scoring NORMAL priority context.""" scorer = PriorityScorer() context = TaskContext( content="Normal task", source="task", priority=ContextPriority.NORMAL.value, ) score = await scorer.score(context, "query") # NORMAL (50) = 0.5, plus TASK bonus (0.15) = 0.65 assert 0.6 <= score <= 0.7 @pytest.mark.asyncio async def test_score_low_priority(self) -> None: """Test scoring LOW priority context.""" scorer = PriorityScorer() context = KnowledgeContext( content="Low priority knowledge", source="docs", priority=ContextPriority.LOW.value, ) score = await scorer.score(context, "query") # LOW (20) = 0.2, no bonus for KNOWLEDGE assert 0.15 <= score <= 0.25 @pytest.mark.asyncio async def test_type_bonuses(self) -> None: """Test type-specific priority bonuses.""" scorer = PriorityScorer() # All with same base priority system_ctx = SystemContext( content="System", source="system", priority=50, ) task_ctx = TaskContext( content="Task", source="task", priority=50, ) knowledge_ctx = KnowledgeContext( content="Knowledge", source="docs", priority=50, ) system_score = await scorer.score(system_ctx, "query") task_score = await scorer.score(task_ctx, "query") knowledge_score = await scorer.score(knowledge_ctx, "query") # System has highest bonus (0.2), task next (0.15), knowledge has none assert system_score > task_score > knowledge_score def test_get_type_bonus(self) -> None: """Test getting type bonus.""" scorer = PriorityScorer() assert scorer.get_type_bonus(ContextType.SYSTEM) == 0.2 assert scorer.get_type_bonus(ContextType.TASK) == 0.15 assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.0 def test_set_type_bonus(self) -> None: """Test setting custom type bonus.""" scorer = PriorityScorer() scorer.set_type_bonus(ContextType.KNOWLEDGE, 0.1) assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.1 def test_set_type_bonus_invalid(self) -> None: """Test setting invalid type bonus.""" scorer = PriorityScorer() with pytest.raises(ValueError): scorer.set_type_bonus(ContextType.KNOWLEDGE, 1.5) with pytest.raises(ValueError): scorer.set_type_bonus(ContextType.KNOWLEDGE, -0.1) class TestCompositeScorer: """Tests for CompositeScorer.""" def test_creation(self) -> None: """Test scorer creation with default weights.""" scorer = CompositeScorer() weights = scorer.weights assert weights["relevance"] == 0.5 assert weights["recency"] == 0.3 assert weights["priority"] == 0.2 def test_creation_with_custom_weights(self) -> None: """Test scorer creation with custom weights.""" scorer = CompositeScorer( relevance_weight=0.6, recency_weight=0.2, priority_weight=0.2, ) weights = scorer.weights assert weights["relevance"] == 0.6 assert weights["recency"] == 0.2 assert weights["priority"] == 0.2 def test_update_weights(self) -> None: """Test updating weights.""" scorer = CompositeScorer() scorer.update_weights(relevance=0.7, recency=0.2, priority=0.1) weights = scorer.weights assert weights["relevance"] == 0.7 assert weights["recency"] == 0.2 assert weights["priority"] == 0.1 def test_update_weights_partial(self) -> None: """Test partially updating weights.""" scorer = CompositeScorer() original_recency = scorer.weights["recency"] scorer.update_weights(relevance=0.8) assert scorer.weights["relevance"] == 0.8 assert scorer.weights["recency"] == original_recency @pytest.mark.asyncio async def test_score_basic(self) -> None: """Test basic composite scoring.""" scorer = CompositeScorer() context = KnowledgeContext( content="Test content", source="docs", relevance_score=0.8, timestamp=datetime.now(UTC), priority=ContextPriority.NORMAL.value, ) score = await scorer.score(context, "test query") assert 0.0 <= score <= 1.0 @pytest.mark.asyncio async def test_score_with_details(self) -> None: """Test scoring with detailed breakdown.""" scorer = CompositeScorer() context = KnowledgeContext( content="Test content", source="docs", relevance_score=0.8, timestamp=datetime.now(UTC), priority=ContextPriority.HIGH.value, ) scored = await scorer.score_with_details(context, "test query") assert isinstance(scored, ScoredContext) assert scored.context is context assert 0.0 <= scored.composite_score <= 1.0 assert scored.relevance_score == 0.8 assert scored.recency_score > 0.9 # Very recent assert scored.priority_score > 0.5 # HIGH priority @pytest.mark.asyncio async def test_score_cached_on_context(self) -> None: """Test that score is cached on the context.""" scorer = CompositeScorer() context = KnowledgeContext( content="Test", source="docs", relevance_score=0.5, ) # First scoring await scorer.score(context, "query") assert context._score is not None # Second scoring should use cached value context._score = 0.999 # Set to a known value score2 = await scorer.score(context, "query") assert score2 == 0.999 @pytest.mark.asyncio async def test_score_batch(self) -> None: """Test batch scoring.""" scorer = CompositeScorer() contexts = [ KnowledgeContext( content="High relevance", source="docs", relevance_score=0.9, ), KnowledgeContext( content="Low relevance", source="docs", relevance_score=0.2, ), ] scored = await scorer.score_batch(contexts, "query") assert len(scored) == 2 assert scored[0].relevance_score > scored[1].relevance_score @pytest.mark.asyncio async def test_rank(self) -> None: """Test ranking contexts.""" scorer = CompositeScorer() contexts = [ KnowledgeContext( content="Low", source="docs", relevance_score=0.2 ), KnowledgeContext( content="High", source="docs", relevance_score=0.9 ), KnowledgeContext( content="Medium", source="docs", relevance_score=0.5 ), ] ranked = await scorer.rank(contexts, "query") # Should be sorted by score (highest first) assert len(ranked) == 3 assert ranked[0].relevance_score == 0.9 assert ranked[1].relevance_score == 0.5 assert ranked[2].relevance_score == 0.2 @pytest.mark.asyncio async def test_rank_with_limit(self) -> None: """Test ranking with limit.""" scorer = CompositeScorer() contexts = [ KnowledgeContext( content=str(i), source="docs", relevance_score=i / 10 ) for i in range(10) ] ranked = await scorer.rank(contexts, "query", limit=3) assert len(ranked) == 3 @pytest.mark.asyncio async def test_rank_with_min_score(self) -> None: """Test ranking with minimum score threshold.""" scorer = CompositeScorer() contexts = [ KnowledgeContext( content="Low", source="docs", relevance_score=0.1 ), KnowledgeContext( content="High", source="docs", relevance_score=0.9 ), ] ranked = await scorer.rank(contexts, "query", min_score=0.5) # Only the high relevance context should pass the threshold assert len(ranked) <= 2 # Could be 1 if min_score filters def test_set_mcp_manager(self) -> None: """Test setting MCP manager.""" scorer = CompositeScorer() mock_mcp = MagicMock() scorer.set_mcp_manager(mock_mcp) assert scorer._relevance_scorer._mcp is mock_mcp class TestScoredContext: """Tests for ScoredContext dataclass.""" def test_creation(self) -> None: """Test ScoredContext creation.""" context = TaskContext(content="Test", source="task") scored = ScoredContext( context=context, composite_score=0.75, relevance_score=0.8, recency_score=0.7, priority_score=0.5, ) assert scored.context is context assert scored.composite_score == 0.75 def test_comparison_operators(self) -> None: """Test comparison operators for sorting.""" ctx1 = TaskContext(content="1", source="task") ctx2 = TaskContext(content="2", source="task") scored1 = ScoredContext(context=ctx1, composite_score=0.5) scored2 = ScoredContext(context=ctx2, composite_score=0.8) assert scored1 < scored2 assert scored2 > scored1 def test_sorting(self) -> None: """Test sorting scored contexts.""" contexts = [ ScoredContext( context=TaskContext(content="Low", source="task"), composite_score=0.3, ), ScoredContext( context=TaskContext(content="High", source="task"), composite_score=0.9, ), ScoredContext( context=TaskContext(content="Medium", source="task"), composite_score=0.6, ), ] sorted_contexts = sorted(contexts, reverse=True) assert sorted_contexts[0].composite_score == 0.9 assert sorted_contexts[1].composite_score == 0.6 assert sorted_contexts[2].composite_score == 0.3 class TestBaseScorer: """Tests for BaseScorer abstract class.""" def test_weight_property(self) -> None: """Test weight property.""" # Use a concrete implementation scorer = RelevanceScorer(weight=0.7) assert scorer.weight == 0.7 def test_weight_setter_valid(self) -> None: """Test weight setter with valid values.""" scorer = RelevanceScorer() scorer.weight = 0.5 assert scorer.weight == 0.5 def test_weight_setter_invalid(self) -> None: """Test weight setter with invalid values.""" scorer = RelevanceScorer() with pytest.raises(ValueError): scorer.weight = -0.1 with pytest.raises(ValueError): scorer.weight = 1.5 def test_normalize_score(self) -> None: """Test score normalization.""" scorer = RelevanceScorer() # Normal range assert scorer.normalize_score(0.5) == 0.5 # Below 0 assert scorer.normalize_score(-0.5) == 0.0 # Above 1 assert scorer.normalize_score(1.5) == 1.0 # Boundaries assert scorer.normalize_score(0.0) == 0.0 assert scorer.normalize_score(1.0) == 1.0