"""Tests for context scoring module.""" from datetime import UTC, datetime, timedelta from unittest.mock import AsyncMock, MagicMock import pytest from app.services.context.scoring import ( CompositeScorer, PriorityScorer, RecencyScorer, RelevanceScorer, ScoredContext, ) from app.services.context.types import ( ContextPriority, ContextType, ConversationContext, KnowledgeContext, MessageRole, SystemContext, TaskContext, ) class TestRelevanceScorer: """Tests for RelevanceScorer.""" def test_creation(self) -> None: """Test scorer creation.""" scorer = RelevanceScorer() assert scorer.weight == 1.0 def test_creation_with_weight(self) -> None: """Test scorer creation with custom weight.""" scorer = RelevanceScorer(weight=0.5) assert scorer.weight == 0.5 @pytest.mark.asyncio async def test_score_with_precomputed_relevance(self) -> None: """Test scoring with pre-computed relevance score.""" scorer = RelevanceScorer() # KnowledgeContext with pre-computed score context = KnowledgeContext( content="Test content about Python", source="docs/python.md", relevance_score=0.85, ) score = await scorer.score(context, "Python programming") assert score == 0.85 @pytest.mark.asyncio async def test_score_with_metadata_score(self) -> None: """Test scoring with metadata-provided score.""" scorer = RelevanceScorer() context = SystemContext( content="System prompt", source="system", metadata={"relevance_score": 0.9}, ) score = await scorer.score(context, "anything") assert score == 0.9 @pytest.mark.asyncio async def test_score_fallback_to_keyword_matching(self) -> None: """Test fallback to keyword matching when no score available.""" scorer = RelevanceScorer(keyword_fallback_weight=0.5) context = TaskContext( content="Implement authentication with JWT tokens", source="task", ) # Query has matching keywords score = await scorer.score(context, "JWT authentication") assert score > 0 @pytest.mark.asyncio async def test_keyword_matching_no_overlap(self) -> None: """Test keyword matching with no query overlap.""" scorer = RelevanceScorer() context = TaskContext( content="Implement database migration", source="task", ) score = await scorer.score(context, "xyz abc 123") assert score == 0.0 @pytest.mark.asyncio async def test_keyword_matching_full_overlap(self) -> None: """Test keyword matching with high overlap.""" scorer = RelevanceScorer(keyword_fallback_weight=1.0) context = TaskContext( content="python programming language", source="task", ) score = await scorer.score(context, "python programming") # Should have high score due to keyword overlap assert score > 0.5 @pytest.mark.asyncio async def test_score_with_mcp_success(self) -> None: """Test scoring with MCP semantic similarity.""" mock_mcp = MagicMock() mock_result = MagicMock() mock_result.success = True mock_result.data = {"similarity": 0.75} mock_mcp.call_tool = AsyncMock(return_value=mock_result) scorer = RelevanceScorer(mcp_manager=mock_mcp) context = TaskContext( content="Test content", source="task", ) score = await scorer.score(context, "test query") assert score == 0.75 @pytest.mark.asyncio async def test_score_with_mcp_failure_fallback(self) -> None: """Test fallback when MCP fails.""" mock_mcp = MagicMock() mock_mcp.call_tool = AsyncMock(side_effect=Exception("Connection failed")) scorer = RelevanceScorer(mcp_manager=mock_mcp, keyword_fallback_weight=0.5) context = TaskContext( content="Python programming code", source="task", ) # Should fall back to keyword matching score = await scorer.score(context, "Python code") assert score > 0 @pytest.mark.asyncio 

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RelevanceScorer()
        contexts = [
            KnowledgeContext(content="Python", source="1", relevance_score=0.8),
            KnowledgeContext(content="Java", source="2", relevance_score=0.6),
            KnowledgeContext(content="Go", source="3", relevance_score=0.9),
        ]
        scores = await scorer.score_batch(contexts, "test")
        assert len(scores) == 3
        assert scores[0] == 0.8
        assert scores[1] == 0.6
        assert scores[2] == 0.9

    def test_set_mcp_manager(self) -> None:
        """Test setting MCP manager."""
        scorer = RelevanceScorer()
        assert scorer._mcp is None

        mock_mcp = MagicMock()
        scorer.set_mcp_manager(mock_mcp)
        assert scorer._mcp is mock_mcp


class TestRecencyScorer:
    """Tests for RecencyScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RecencyScorer()
        assert scorer.weight == 1.0
        assert scorer._half_life_hours == 24.0

    def test_creation_with_custom_half_life(self) -> None:
        """Test scorer creation with custom half-life."""
        scorer = RecencyScorer(half_life_hours=12.0)
        assert scorer._half_life_hours == 12.0

    @pytest.mark.asyncio
    async def test_score_recent_context(self) -> None:
        """Test scoring a very recent context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        context = TaskContext(
            content="Recent task",
            source="task",
            timestamp=now,
        )
        score = await scorer.score(context, "query", reference_time=now)
        # Very recent should have score near 1.0
        assert score > 0.99

    @pytest.mark.asyncio
    async def test_score_at_half_life(self) -> None:
        """Test scoring at exactly half-life age."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        half_life_ago = now - timedelta(hours=24)
        context = TaskContext(
            content="Day old task",
            source="task",
            timestamp=half_life_ago,
        )
        score = await scorer.score(context, "query", reference_time=now)
        # At half-life, score should be ~0.5
        assert 0.49 <= score <= 0.51

    @pytest.mark.asyncio
    async def test_score_old_context(self) -> None:
        """Test scoring a very old context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        week_ago = now - timedelta(days=7)
        context = TaskContext(
            content="Week old task",
            source="task",
            timestamp=week_ago,
        )
        score = await scorer.score(context, "query", reference_time=now)
        # 7 days with 24h half-life = very low score
        assert score < 0.01

    @pytest.mark.asyncio
    async def test_type_specific_half_lives(self) -> None:
        """Test that different context types have different half-lives."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)
        one_hour_ago = now - timedelta(hours=1)

        # Conversation has 1 hour half-life by default
        conv_context = ConversationContext(
            content="Hello",
            source="chat",
            role=MessageRole.USER,
            timestamp=one_hour_ago,
        )
        # Knowledge has 168 hour (1 week) half-life by default
        knowledge_context = KnowledgeContext(
            content="Documentation",
            source="docs",
            timestamp=one_hour_ago,
        )

        conv_score = await scorer.score(conv_context, "query", reference_time=now)
        knowledge_score = await scorer.score(
            knowledge_context, "query", reference_time=now
        )
        # Conversation should decay much faster
        assert conv_score < knowledge_score

    def test_get_half_life(self) -> None:
        """Test getting half-life for context type."""
        scorer = RecencyScorer()
        assert scorer.get_half_life(ContextType.CONVERSATION) == 1.0
        assert scorer.get_half_life(ContextType.KNOWLEDGE) == 168.0
        assert scorer.get_half_life(ContextType.SYSTEM) == 720.0
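
    # The assertions above are consistent with exponential half-life decay,
    # i.e. score = 2 ** (-age_hours / half_life_hours): one half-life gives
    # 2**-1 = 0.5, and 7 days at a 24h half-life gives 2**-7 ~= 0.008 < 0.01.
    # The exact decay curve is an implementation detail of RecencyScorer;
    # these tests only check it at those reference points.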

    def test_set_half_life(self) -> None:
        """Test setting custom half-life."""
        scorer = RecencyScorer()
        scorer.set_half_life(ContextType.TASK, 48.0)
        assert scorer.get_half_life(ContextType.TASK) == 48.0

    def test_set_half_life_invalid(self) -> None:
        """Test setting invalid half-life."""
        scorer = RecencyScorer()
        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, 0)
        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, -1)

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)
        contexts = [
            TaskContext(content="1", source="t", timestamp=now),
            TaskContext(content="2", source="t", timestamp=now - timedelta(hours=24)),
            TaskContext(content="3", source="t", timestamp=now - timedelta(hours=48)),
        ]
        scores = await scorer.score_batch(contexts, "query", reference_time=now)
        assert len(scores) == 3
        # Scores should be in descending order (more recent = higher)
        assert scores[0] > scores[1] > scores[2]


class TestPriorityScorer:
    """Tests for PriorityScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = PriorityScorer()
        assert scorer.weight == 1.0

    @pytest.mark.asyncio
    async def test_score_critical_priority(self) -> None:
        """Test scoring CRITICAL priority context."""
        scorer = PriorityScorer()
        context = SystemContext(
            content="Critical system prompt",
            source="system",
            priority=ContextPriority.CRITICAL.value,
        )
        score = await scorer.score(context, "query")
        # CRITICAL (100) + type bonus should be > 1.0, normalized to 1.0
        assert score == 1.0

    @pytest.mark.asyncio
    async def test_score_normal_priority(self) -> None:
        """Test scoring NORMAL priority context."""
        scorer = PriorityScorer()
        context = TaskContext(
            content="Normal task",
            source="task",
            priority=ContextPriority.NORMAL.value,
        )
        score = await scorer.score(context, "query")
        # NORMAL (50) = 0.5, plus TASK bonus (0.15) = 0.65
        assert 0.6 <= score <= 0.7

    @pytest.mark.asyncio
    async def test_score_low_priority(self) -> None:
        """Test scoring LOW priority context."""
        scorer = PriorityScorer()
        context = KnowledgeContext(
            content="Low priority knowledge",
            source="docs",
            priority=ContextPriority.LOW.value,
        )
        score = await scorer.score(context, "query")
        # LOW (20) = 0.2, no bonus for KNOWLEDGE
        assert 0.15 <= score <= 0.25

    @pytest.mark.asyncio
    async def test_type_bonuses(self) -> None:
        """Test type-specific priority bonuses."""
        scorer = PriorityScorer()

        # All with same base priority
        system_ctx = SystemContext(
            content="System",
            source="system",
            priority=50,
        )
        task_ctx = TaskContext(
            content="Task",
            source="task",
            priority=50,
        )
        knowledge_ctx = KnowledgeContext(
            content="Knowledge",
            source="docs",
            priority=50,
        )

        system_score = await scorer.score(system_ctx, "query")
        task_score = await scorer.score(task_ctx, "query")
        knowledge_score = await scorer.score(knowledge_ctx, "query")

        # System has highest bonus (0.2), task next (0.15), knowledge has none
        assert system_score > task_score > knowledge_score
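
    # The expectations above imply a priority score of roughly
    # min(1.0, priority / 100 + type_bonus): NORMAL (50) with the TASK bonus
    # (0.15) gives 0.65, LOW (20) with no KNOWLEDGE bonus gives 0.2, and
    # CRITICAL (100) plus any bonus clamps to 1.0. The actual formula lives
    # in PriorityScorer; these tests only bound it.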

    def test_get_type_bonus(self) -> None:
        """Test getting type bonus."""
        scorer = PriorityScorer()
        assert scorer.get_type_bonus(ContextType.SYSTEM) == 0.2
        assert scorer.get_type_bonus(ContextType.TASK) == 0.15
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.0

    def test_set_type_bonus(self) -> None:
        """Test setting custom type bonus."""
        scorer = PriorityScorer()
        scorer.set_type_bonus(ContextType.KNOWLEDGE, 0.1)
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.1

    def test_set_type_bonus_invalid(self) -> None:
        """Test setting invalid type bonus."""
        scorer = PriorityScorer()
        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, 1.5)
        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, -0.1)


class TestCompositeScorer:
    """Tests for CompositeScorer."""

    def test_creation(self) -> None:
        """Test scorer creation with default weights."""
        scorer = CompositeScorer()
        weights = scorer.weights
        assert weights["relevance"] == 0.5
        assert weights["recency"] == 0.3
        assert weights["priority"] == 0.2

    def test_creation_with_custom_weights(self) -> None:
        """Test scorer creation with custom weights."""
        scorer = CompositeScorer(
            relevance_weight=0.6,
            recency_weight=0.2,
            priority_weight=0.2,
        )
        weights = scorer.weights
        assert weights["relevance"] == 0.6
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.2

    def test_update_weights(self) -> None:
        """Test updating weights."""
        scorer = CompositeScorer()
        scorer.update_weights(relevance=0.7, recency=0.2, priority=0.1)
        weights = scorer.weights
        assert weights["relevance"] == 0.7
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.1

    def test_update_weights_partial(self) -> None:
        """Test partially updating weights."""
        scorer = CompositeScorer()
        original_recency = scorer.weights["recency"]
        scorer.update_weights(relevance=0.8)
        assert scorer.weights["relevance"] == 0.8
        assert scorer.weights["recency"] == original_recency

    @pytest.mark.asyncio
    async def test_score_basic(self) -> None:
        """Test basic composite scoring."""
        scorer = CompositeScorer()
        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.NORMAL.value,
        )
        score = await scorer.score(context, "test query")
        assert 0.0 <= score <= 1.0

    @pytest.mark.asyncio
    async def test_score_with_details(self) -> None:
        """Test scoring with detailed breakdown."""
        scorer = CompositeScorer()
        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.HIGH.value,
        )
        scored = await scorer.score_with_details(context, "test query")
        assert isinstance(scored, ScoredContext)
        assert scored.context is context
        assert 0.0 <= scored.composite_score <= 1.0
        assert scored.relevance_score == 0.8
        assert scored.recency_score > 0.9  # Very recent
        assert scored.priority_score > 0.5  # HIGH priority
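
    # With the default weights checked in test_creation, the composite score
    # is presumably the weighted combination
    #     composite = 0.5 * relevance + 0.3 * recency + 0.2 * priority,
    # which stays in [0, 1] whenever the component scores do. The tests here
    # only pin the default weights and the [0, 1] range, not the exact blend.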
""" scorer = CompositeScorer() context = KnowledgeContext( content="Test", source="docs", relevance_score=0.5, ) # After scoring, context._score should remain None # (we don't cache on context because scores are query-dependent) await scorer.score(context, "query") # The scorer should compute fresh scores each time # rather than caching on the context object # Score again with different query - should compute fresh score score1 = await scorer.score(context, "query 1") score2 = await scorer.score(context, "query 2") # Both should be valid scores (not necessarily equal since queries differ) assert 0.0 <= score1 <= 1.0 assert 0.0 <= score2 <= 1.0 @pytest.mark.asyncio async def test_score_batch(self) -> None: """Test batch scoring.""" scorer = CompositeScorer() contexts = [ KnowledgeContext( content="High relevance", source="docs", relevance_score=0.9, ), KnowledgeContext( content="Low relevance", source="docs", relevance_score=0.2, ), ] scored = await scorer.score_batch(contexts, "query") assert len(scored) == 2 assert scored[0].relevance_score > scored[1].relevance_score @pytest.mark.asyncio async def test_rank(self) -> None: """Test ranking contexts.""" scorer = CompositeScorer() contexts = [ KnowledgeContext(content="Low", source="docs", relevance_score=0.2), KnowledgeContext(content="High", source="docs", relevance_score=0.9), KnowledgeContext(content="Medium", source="docs", relevance_score=0.5), ] ranked = await scorer.rank(contexts, "query") # Should be sorted by score (highest first) assert len(ranked) == 3 assert ranked[0].relevance_score == 0.9 assert ranked[1].relevance_score == 0.5 assert ranked[2].relevance_score == 0.2 @pytest.mark.asyncio async def test_rank_with_limit(self) -> None: """Test ranking with limit.""" scorer = CompositeScorer() contexts = [ KnowledgeContext(content=str(i), source="docs", relevance_score=i / 10) for i in range(10) ] ranked = await scorer.rank(contexts, "query", limit=3) assert len(ranked) == 3 @pytest.mark.asyncio async def test_rank_with_min_score(self) -> None: """Test ranking with minimum score threshold.""" scorer = CompositeScorer() contexts = [ KnowledgeContext(content="Low", source="docs", relevance_score=0.1), KnowledgeContext(content="High", source="docs", relevance_score=0.9), ] ranked = await scorer.rank(contexts, "query", min_score=0.5) # Only the high relevance context should pass the threshold assert len(ranked) <= 2 # Could be 1 if min_score filters def test_set_mcp_manager(self) -> None: """Test setting MCP manager.""" scorer = CompositeScorer() mock_mcp = MagicMock() scorer.set_mcp_manager(mock_mcp) assert scorer._relevance_scorer._mcp is mock_mcp @pytest.mark.asyncio async def test_concurrent_scoring_same_context_no_race(self) -> None: """Test that concurrent scoring of the same context doesn't cause race conditions. This verifies that the per-context locking mechanism prevents the same context from being scored multiple times when scored concurrently. 
""" import asyncio # Use scorer with recency_weight=0 to eliminate time-dependent variation # (recency scores change as time passes between calls) scorer = CompositeScorer( relevance_weight=0.5, recency_weight=0.0, # Disable recency to get deterministic results priority_weight=0.5, ) # Create a single context that will be scored multiple times concurrently context = KnowledgeContext( content="Test content for race condition test", source="docs", relevance_score=0.75, ) # Score the same context many times in parallel num_concurrent = 50 tasks = [scorer.score(context, "test query") for _ in range(num_concurrent)] scores = await asyncio.gather(*tasks) # All scores should be identical (deterministic scoring without recency) assert all(s == scores[0] for s in scores) # Note: We don't cache _score on context because scores are query-dependent @pytest.mark.asyncio async def test_concurrent_scoring_different_contexts(self) -> None: """Test that concurrent scoring of different contexts works correctly. Different contexts should not interfere with each other during parallel scoring. """ import asyncio scorer = CompositeScorer() # Create many different contexts contexts = [ KnowledgeContext( content=f"Test content {i}", source="docs", relevance_score=i / 10, ) for i in range(10) ] # Score all contexts concurrently tasks = [scorer.score(ctx, "test query") for ctx in contexts] scores = await asyncio.gather(*tasks) # Each context should have a different score based on its relevance assert len(set(scores)) > 1 # Not all the same # Note: We don't cache _score on context because scores are query-dependent class TestScoredContext: """Tests for ScoredContext dataclass.""" def test_creation(self) -> None: """Test ScoredContext creation.""" context = TaskContext(content="Test", source="task") scored = ScoredContext( context=context, composite_score=0.75, relevance_score=0.8, recency_score=0.7, priority_score=0.5, ) assert scored.context is context assert scored.composite_score == 0.75 def test_comparison_operators(self) -> None: """Test comparison operators for sorting.""" ctx1 = TaskContext(content="1", source="task") ctx2 = TaskContext(content="2", source="task") scored1 = ScoredContext(context=ctx1, composite_score=0.5) scored2 = ScoredContext(context=ctx2, composite_score=0.8) assert scored1 < scored2 assert scored2 > scored1 def test_sorting(self) -> None: """Test sorting scored contexts.""" contexts = [ ScoredContext( context=TaskContext(content="Low", source="task"), composite_score=0.3, ), ScoredContext( context=TaskContext(content="High", source="task"), composite_score=0.9, ), ScoredContext( context=TaskContext(content="Medium", source="task"), composite_score=0.6, ), ] sorted_contexts = sorted(contexts, reverse=True) assert sorted_contexts[0].composite_score == 0.9 assert sorted_contexts[1].composite_score == 0.6 assert sorted_contexts[2].composite_score == 0.3 class TestBaseScorer: """Tests for BaseScorer abstract class.""" def test_weight_property(self) -> None: """Test weight property.""" # Use a concrete implementation scorer = RelevanceScorer(weight=0.7) assert scorer.weight == 0.7 def test_weight_setter_valid(self) -> None: """Test weight setter with valid values.""" scorer = RelevanceScorer() scorer.weight = 0.5 assert scorer.weight == 0.5 def test_weight_setter_invalid(self) -> None: """Test weight setter with invalid values.""" scorer = RelevanceScorer() with pytest.raises(ValueError): scorer.weight = -0.1 with pytest.raises(ValueError): scorer.weight = 1.5 def test_normalize_score(self) -> 


class TestBaseScorer:
    """Tests for BaseScorer abstract class."""

    def test_weight_property(self) -> None:
        """Test weight property."""
        # Use a concrete implementation
        scorer = RelevanceScorer(weight=0.7)
        assert scorer.weight == 0.7

    def test_weight_setter_valid(self) -> None:
        """Test weight setter with valid values."""
        scorer = RelevanceScorer()
        scorer.weight = 0.5
        assert scorer.weight == 0.5

    def test_weight_setter_invalid(self) -> None:
        """Test weight setter with invalid values."""
        scorer = RelevanceScorer()
        with pytest.raises(ValueError):
            scorer.weight = -0.1
        with pytest.raises(ValueError):
            scorer.weight = 1.5

    def test_normalize_score(self) -> None:
        """Test score normalization."""
        scorer = RelevanceScorer()
        # Normal range
        assert scorer.normalize_score(0.5) == 0.5
        # Below 0
        assert scorer.normalize_score(-0.5) == 0.0
        # Above 1
        assert scorer.normalize_score(1.5) == 1.0
        # Boundaries
        assert scorer.normalize_score(0.0) == 0.0
        assert scorer.normalize_score(1.0) == 1.0