"""Tests for context scoring module."""
|
|
|
|
from datetime import UTC, datetime, timedelta
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
import pytest
|
|
|
|
from app.services.context.scoring import (
|
|
CompositeScorer,
|
|
PriorityScorer,
|
|
RecencyScorer,
|
|
RelevanceScorer,
|
|
ScoredContext,
|
|
)
|
|
from app.services.context.types import (
|
|
ContextPriority,
|
|
ContextType,
|
|
ConversationContext,
|
|
KnowledgeContext,
|
|
MessageRole,
|
|
SystemContext,
|
|
TaskContext,
|
|
)
|
|
|
|
|
|


class TestRelevanceScorer:
    """Tests for RelevanceScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RelevanceScorer()
        assert scorer.weight == 1.0

    def test_creation_with_weight(self) -> None:
        """Test scorer creation with custom weight."""
        scorer = RelevanceScorer(weight=0.5)
        assert scorer.weight == 0.5

    @pytest.mark.asyncio
    async def test_score_with_precomputed_relevance(self) -> None:
        """Test scoring with a pre-computed relevance score."""
        scorer = RelevanceScorer()

        # KnowledgeContext with a pre-computed score
        context = KnowledgeContext(
            content="Test content about Python",
            source="docs/python.md",
            relevance_score=0.85,
        )

        score = await scorer.score(context, "Python programming")
        assert score == 0.85

    @pytest.mark.asyncio
    async def test_score_with_metadata_score(self) -> None:
        """Test scoring with a metadata-provided score."""
        scorer = RelevanceScorer()

        context = SystemContext(
            content="System prompt",
            source="system",
            metadata={"relevance_score": 0.9},
        )

        score = await scorer.score(context, "anything")
        assert score == 0.9

    @pytest.mark.asyncio
    async def test_score_fallback_to_keyword_matching(self) -> None:
        """Test fallback to keyword matching when no score is available."""
        scorer = RelevanceScorer(keyword_fallback_weight=0.5)

        context = TaskContext(
            content="Implement authentication with JWT tokens",
            source="task",
        )

        # Query has matching keywords
        score = await scorer.score(context, "JWT authentication")
        assert score > 0

    @pytest.mark.asyncio
    async def test_keyword_matching_no_overlap(self) -> None:
        """Test keyword matching with no query overlap."""
        scorer = RelevanceScorer()

        context = TaskContext(
            content="Implement database migration",
            source="task",
        )

        score = await scorer.score(context, "xyz abc 123")
        assert score == 0.0

    @pytest.mark.asyncio
    async def test_keyword_matching_full_overlap(self) -> None:
        """Test keyword matching with high overlap."""
        scorer = RelevanceScorer(keyword_fallback_weight=1.0)

        context = TaskContext(
            content="python programming language",
            source="task",
        )

        score = await scorer.score(context, "python programming")
        # Should have a high score due to keyword overlap
        assert score > 0.5
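
    # NOTE: The keyword-fallback tests above assume a bag-of-words overlap
    # heuristic roughly like the hypothetical sketch below (names and exact
    # tokenization are assumptions, not the verified implementation):
    #
    #     def _keyword_score(content: str, query: str, weight: float) -> float:
    #         content_words = set(content.lower().split())
    #         query_words = set(query.lower().split())
    #         if not query_words:
    #             return 0.0
    #         overlap = len(content_words & query_words) / len(query_words)
    #         return overlap * weight
    #
    # Under that model, "python programming" against "python programming
    # language" overlaps fully, so keyword_fallback_weight=1.0 clears the
    # 0.5 threshold asserted above, and a query with no shared words
    # scores exactly 0.0.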

    @pytest.mark.asyncio
    async def test_score_with_mcp_success(self) -> None:
        """Test scoring with MCP semantic similarity."""
        mock_mcp = MagicMock()
        mock_result = MagicMock()
        mock_result.success = True
        mock_result.data = {"similarity": 0.75}
        mock_mcp.call_tool = AsyncMock(return_value=mock_result)

        scorer = RelevanceScorer(mcp_manager=mock_mcp)

        context = TaskContext(
            content="Test content",
            source="task",
        )

        score = await scorer.score(context, "test query")
        assert score == 0.75

    @pytest.mark.asyncio
    async def test_score_with_mcp_failure_fallback(self) -> None:
        """Test fallback when MCP fails."""
        mock_mcp = MagicMock()
        mock_mcp.call_tool = AsyncMock(side_effect=Exception("Connection failed"))

        scorer = RelevanceScorer(mcp_manager=mock_mcp, keyword_fallback_weight=0.5)

        context = TaskContext(
            content="Python programming code",
            source="task",
        )

        # Should fall back to keyword matching
        score = await scorer.score(context, "Python code")
        assert score > 0

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RelevanceScorer()

        contexts = [
            KnowledgeContext(content="Python", source="1", relevance_score=0.8),
            KnowledgeContext(content="Java", source="2", relevance_score=0.6),
            KnowledgeContext(content="Go", source="3", relevance_score=0.9),
        ]

        scores = await scorer.score_batch(contexts, "test")
        assert len(scores) == 3
        assert scores[0] == 0.8
        assert scores[1] == 0.6
        assert scores[2] == 0.9

    def test_set_mcp_manager(self) -> None:
        """Test setting the MCP manager."""
        scorer = RelevanceScorer()
        assert scorer._mcp is None

        mock_mcp = MagicMock()
        scorer.set_mcp_manager(mock_mcp)
        assert scorer._mcp is mock_mcp


class TestRecencyScorer:
    """Tests for RecencyScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RecencyScorer()
        assert scorer.weight == 1.0
        assert scorer._half_life_hours == 24.0

    def test_creation_with_custom_half_life(self) -> None:
        """Test scorer creation with custom half-life."""
        scorer = RecencyScorer(half_life_hours=12.0)
        assert scorer._half_life_hours == 12.0

    @pytest.mark.asyncio
    async def test_score_recent_context(self) -> None:
        """Test scoring a very recent context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)

        context = TaskContext(
            content="Recent task",
            source="task",
            timestamp=now,
        )

        score = await scorer.score(context, "query", reference_time=now)
        # Very recent should have score near 1.0
        assert score > 0.99

    @pytest.mark.asyncio
    async def test_score_at_half_life(self) -> None:
        """Test scoring at exactly half-life age."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        half_life_ago = now - timedelta(hours=24)

        context = TaskContext(
            content="Day old task",
            source="task",
            timestamp=half_life_ago,
        )

        score = await scorer.score(context, "query", reference_time=now)
        # At half-life, score should be ~0.5
        assert 0.49 <= score <= 0.51
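
    # NOTE: The half-life assertions assume standard exponential decay
    # (an assumption about the implementation, not verified here):
    #
    #     score = 0.5 ** (age_hours / half_life_hours)
    #
    # Age 0 gives 1.0, age == half_life gives exactly 0.5, and 7 days at a
    # 24h half-life gives 0.5 ** 7 ≈ 0.0078, which is why the next test
    # expects a score below 0.01.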

    @pytest.mark.asyncio
    async def test_score_old_context(self) -> None:
        """Test scoring a very old context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        week_ago = now - timedelta(days=7)

        context = TaskContext(
            content="Week old task",
            source="task",
            timestamp=week_ago,
        )

        score = await scorer.score(context, "query", reference_time=now)
        # 7 days with 24h half-life = very low score
        assert score < 0.01

    @pytest.mark.asyncio
    async def test_type_specific_half_lives(self) -> None:
        """Test that different context types have different half-lives."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)
        one_hour_ago = now - timedelta(hours=1)

        # Conversation has a 1 hour half-life by default
        conv_context = ConversationContext(
            content="Hello",
            source="chat",
            role=MessageRole.USER,
            timestamp=one_hour_ago,
        )

        # Knowledge has a 168 hour (1 week) half-life by default
        knowledge_context = KnowledgeContext(
            content="Documentation",
            source="docs",
            timestamp=one_hour_ago,
        )

        conv_score = await scorer.score(conv_context, "query", reference_time=now)
        knowledge_score = await scorer.score(
            knowledge_context, "query", reference_time=now
        )

        # Conversation should decay much faster
        assert conv_score < knowledge_score

    def test_get_half_life(self) -> None:
        """Test getting the half-life for a context type."""
        scorer = RecencyScorer()

        assert scorer.get_half_life(ContextType.CONVERSATION) == 1.0
        assert scorer.get_half_life(ContextType.KNOWLEDGE) == 168.0
        assert scorer.get_half_life(ContextType.SYSTEM) == 720.0

    def test_set_half_life(self) -> None:
        """Test setting a custom half-life."""
        scorer = RecencyScorer()

        scorer.set_half_life(ContextType.TASK, 48.0)
        assert scorer.get_half_life(ContextType.TASK) == 48.0

    def test_set_half_life_invalid(self) -> None:
        """Test setting an invalid half-life."""
        scorer = RecencyScorer()

        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, 0)

        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, -1)

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)

        contexts = [
            TaskContext(content="1", source="t", timestamp=now),
            TaskContext(content="2", source="t", timestamp=now - timedelta(hours=24)),
            TaskContext(content="3", source="t", timestamp=now - timedelta(hours=48)),
        ]

        scores = await scorer.score_batch(contexts, "query", reference_time=now)
        assert len(scores) == 3
        # Scores should be in descending order (more recent = higher)
        assert scores[0] > scores[1] > scores[2]


class TestPriorityScorer:
    """Tests for PriorityScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = PriorityScorer()
        assert scorer.weight == 1.0

    @pytest.mark.asyncio
    async def test_score_critical_priority(self) -> None:
        """Test scoring CRITICAL priority context."""
        scorer = PriorityScorer()

        context = SystemContext(
            content="Critical system prompt",
            source="system",
            priority=ContextPriority.CRITICAL.value,
        )

        score = await scorer.score(context, "query")
        # CRITICAL (100) + type bonus should be > 1.0, normalized to 1.0
        assert score == 1.0

    @pytest.mark.asyncio
    async def test_score_normal_priority(self) -> None:
        """Test scoring NORMAL priority context."""
        scorer = PriorityScorer()

        context = TaskContext(
            content="Normal task",
            source="task",
            priority=ContextPriority.NORMAL.value,
        )

        score = await scorer.score(context, "query")
        # NORMAL (50) = 0.5, plus TASK bonus (0.15) = 0.65
        assert 0.6 <= score <= 0.7
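
    # NOTE: The expected ranges above and below are consistent with priority
    # being normalized from the 0-100 enum scale and combined with a per-type
    # bonus, roughly (a hypothetical sketch, not the verified implementation):
    #
    #     score = min(1.0, priority / 100 + type_bonus)
    #
    # e.g. NORMAL (50) on a TASK context: 0.5 + 0.15 = 0.65, while
    # CRITICAL (100) saturates at 1.0 after clamping.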

    @pytest.mark.asyncio
    async def test_score_low_priority(self) -> None:
        """Test scoring LOW priority context."""
        scorer = PriorityScorer()

        context = KnowledgeContext(
            content="Low priority knowledge",
            source="docs",
            priority=ContextPriority.LOW.value,
        )

        score = await scorer.score(context, "query")
        # LOW (20) = 0.2, no bonus for KNOWLEDGE
        assert 0.15 <= score <= 0.25

    @pytest.mark.asyncio
    async def test_type_bonuses(self) -> None:
        """Test type-specific priority bonuses."""
        scorer = PriorityScorer()

        # All with the same base priority
        system_ctx = SystemContext(
            content="System",
            source="system",
            priority=50,
        )
        task_ctx = TaskContext(
            content="Task",
            source="task",
            priority=50,
        )
        knowledge_ctx = KnowledgeContext(
            content="Knowledge",
            source="docs",
            priority=50,
        )

        system_score = await scorer.score(system_ctx, "query")
        task_score = await scorer.score(task_ctx, "query")
        knowledge_score = await scorer.score(knowledge_ctx, "query")

        # System has the highest bonus (0.2), task next (0.15), knowledge has none
        assert system_score > task_score > knowledge_score

    def test_get_type_bonus(self) -> None:
        """Test getting the type bonus."""
        scorer = PriorityScorer()

        assert scorer.get_type_bonus(ContextType.SYSTEM) == 0.2
        assert scorer.get_type_bonus(ContextType.TASK) == 0.15
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.0

    def test_set_type_bonus(self) -> None:
        """Test setting a custom type bonus."""
        scorer = PriorityScorer()

        scorer.set_type_bonus(ContextType.KNOWLEDGE, 0.1)
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.1

    def test_set_type_bonus_invalid(self) -> None:
        """Test setting an invalid type bonus."""
        scorer = PriorityScorer()

        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, 1.5)

        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, -0.1)


class TestCompositeScorer:
    """Tests for CompositeScorer."""

    def test_creation(self) -> None:
        """Test scorer creation with default weights."""
        scorer = CompositeScorer()

        weights = scorer.weights
        assert weights["relevance"] == 0.5
        assert weights["recency"] == 0.3
        assert weights["priority"] == 0.2

    def test_creation_with_custom_weights(self) -> None:
        """Test scorer creation with custom weights."""
        scorer = CompositeScorer(
            relevance_weight=0.6,
            recency_weight=0.2,
            priority_weight=0.2,
        )

        weights = scorer.weights
        assert weights["relevance"] == 0.6
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.2
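
    # NOTE: These tests treat the composite score as a weighted average of
    # the component scores (a sketch under that assumption):
    #
    #     composite = (
    #         w_relevance * relevance + w_recency * recency + w_priority * priority
    #     ) / (w_relevance + w_recency + w_priority)
    #
    # With the defaults (0.5 / 0.3 / 0.2) the weights already sum to 1.0, so
    # the division is a no-op; it matters once weights are updated piecemeal
    # as in the next test.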

    def test_update_weights(self) -> None:
        """Test updating weights."""
        scorer = CompositeScorer()

        scorer.update_weights(relevance=0.7, recency=0.2, priority=0.1)

        weights = scorer.weights
        assert weights["relevance"] == 0.7
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.1

    def test_update_weights_partial(self) -> None:
        """Test partially updating weights."""
        scorer = CompositeScorer()
        original_recency = scorer.weights["recency"]

        scorer.update_weights(relevance=0.8)

        assert scorer.weights["relevance"] == 0.8
        assert scorer.weights["recency"] == original_recency

    @pytest.mark.asyncio
    async def test_score_basic(self) -> None:
        """Test basic composite scoring."""
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.NORMAL.value,
        )

        score = await scorer.score(context, "test query")
        assert 0.0 <= score <= 1.0

    @pytest.mark.asyncio
    async def test_score_with_details(self) -> None:
        """Test scoring with a detailed breakdown."""
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.HIGH.value,
        )

        scored = await scorer.score_with_details(context, "test query")

        assert isinstance(scored, ScoredContext)
        assert scored.context is context
        assert 0.0 <= scored.composite_score <= 1.0
        assert scored.relevance_score == 0.8
        assert scored.recency_score > 0.9  # Very recent
        assert scored.priority_score > 0.5  # HIGH priority

    @pytest.mark.asyncio
    async def test_score_not_cached_on_context(self) -> None:
        """Test that scores are NOT cached on the context.

        Scores are query-dependent, so caching them on the context would
        hand back incorrect values for a different query.
        """
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test",
            source="docs",
            relevance_score=0.5,
        )

        # After scoring, context._score should remain None: the scorer
        # computes fresh scores each time rather than caching on the
        # context object.
        await scorer.score(context, "query")

        # Score again with different queries - each should be a fresh,
        # valid score (not necessarily equal, since the queries differ)
        score1 = await scorer.score(context, "query 1")
        score2 = await scorer.score(context, "query 2")
        assert 0.0 <= score1 <= 1.0
        assert 0.0 <= score2 <= 1.0

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(
                content="High relevance",
                source="docs",
                relevance_score=0.9,
            ),
            KnowledgeContext(
                content="Low relevance",
                source="docs",
                relevance_score=0.2,
            ),
        ]

        scored = await scorer.score_batch(contexts, "query")
        assert len(scored) == 2
        assert scored[0].relevance_score > scored[1].relevance_score

    @pytest.mark.asyncio
    async def test_rank(self) -> None:
        """Test ranking contexts."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(content="Low", source="docs", relevance_score=0.2),
            KnowledgeContext(content="High", source="docs", relevance_score=0.9),
            KnowledgeContext(content="Medium", source="docs", relevance_score=0.5),
        ]

        ranked = await scorer.rank(contexts, "query")

        # Should be sorted by score (highest first)
        assert len(ranked) == 3
        assert ranked[0].relevance_score == 0.9
        assert ranked[1].relevance_score == 0.5
        assert ranked[2].relevance_score == 0.2

    @pytest.mark.asyncio
    async def test_rank_with_limit(self) -> None:
        """Test ranking with a limit."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(content=str(i), source="docs", relevance_score=i / 10)
            for i in range(10)
        ]

        ranked = await scorer.rank(contexts, "query", limit=3)
        assert len(ranked) == 3

    @pytest.mark.asyncio
    async def test_rank_with_min_score(self) -> None:
        """Test ranking with a minimum score threshold."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(content="Low", source="docs", relevance_score=0.1),
            KnowledgeContext(content="High", source="docs", relevance_score=0.9),
        ]

        ranked = await scorer.rank(contexts, "query", min_score=0.5)

        # The composite score also mixes in recency and priority, so the
        # low-relevance context may or may not clear the 0.5 threshold;
        # only an upper bound is asserted.
        assert len(ranked) <= 2

    def test_set_mcp_manager(self) -> None:
        """Test setting the MCP manager."""
        scorer = CompositeScorer()
        mock_mcp = MagicMock()

        scorer.set_mcp_manager(mock_mcp)
        assert scorer._relevance_scorer._mcp is mock_mcp

    @pytest.mark.asyncio
    async def test_concurrent_scoring_same_context_no_race(self) -> None:
        """Test that concurrent scoring of the same context doesn't race.

        This verifies that the per-context locking mechanism keeps results
        consistent when the same context is scored many times concurrently.
        """
        # Use a scorer with recency_weight=0 to eliminate time-dependent
        # variation (recency scores change as time passes between calls)
        scorer = CompositeScorer(
            relevance_weight=0.5,
            recency_weight=0.0,  # Disable recency to get deterministic results
            priority_weight=0.5,
        )

        # Create a single context that will be scored multiple times concurrently
        context = KnowledgeContext(
            content="Test content for race condition test",
            source="docs",
            relevance_score=0.75,
        )

        # Score the same context many times in parallel
        num_concurrent = 50
        tasks = [scorer.score(context, "test query") for _ in range(num_concurrent)]
        scores = await asyncio.gather(*tasks)

        # All scores should be identical (deterministic scoring without recency)
        assert all(s == scores[0] for s in scores)
        # Note: We don't cache _score on the context because scores are query-dependent

    @pytest.mark.asyncio
    async def test_concurrent_scoring_different_contexts(self) -> None:
        """Test that concurrent scoring of different contexts works correctly.

        Different contexts should not interfere with each other during
        parallel scoring.
        """
        scorer = CompositeScorer()

        # Create many different contexts
        contexts = [
            KnowledgeContext(
                content=f"Test content {i}",
                source="docs",
                relevance_score=i / 10,
            )
            for i in range(10)
        ]

        # Score all contexts concurrently
        tasks = [scorer.score(ctx, "test query") for ctx in contexts]
        scores = await asyncio.gather(*tasks)

        # Each context should have a different score based on its relevance
        assert len(set(scores)) > 1  # Not all the same


class TestScoredContext:
    """Tests for the ScoredContext dataclass."""

    def test_creation(self) -> None:
        """Test ScoredContext creation."""
        context = TaskContext(content="Test", source="task")
        scored = ScoredContext(
            context=context,
            composite_score=0.75,
            relevance_score=0.8,
            recency_score=0.7,
            priority_score=0.5,
        )

        assert scored.context is context
        assert scored.composite_score == 0.75

    def test_comparison_operators(self) -> None:
        """Test comparison operators for sorting."""
        ctx1 = TaskContext(content="1", source="task")
        ctx2 = TaskContext(content="2", source="task")

        scored1 = ScoredContext(context=ctx1, composite_score=0.5)
        scored2 = ScoredContext(context=ctx2, composite_score=0.8)

        assert scored1 < scored2
        assert scored2 > scored1
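
    # NOTE: The assertions above assume ScoredContext defines ordering by
    # composite_score (e.g. a dataclass whose __lt__ compares
    # self.composite_score < other.composite_score); that assumed ordering
    # is also what lets sorted(..., reverse=True) rank highest-first in the
    # next test.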

    def test_sorting(self) -> None:
        """Test sorting scored contexts."""
        contexts = [
            ScoredContext(
                context=TaskContext(content="Low", source="task"),
                composite_score=0.3,
            ),
            ScoredContext(
                context=TaskContext(content="High", source="task"),
                composite_score=0.9,
            ),
            ScoredContext(
                context=TaskContext(content="Medium", source="task"),
                composite_score=0.6,
            ),
        ]

        sorted_contexts = sorted(contexts, reverse=True)

        assert sorted_contexts[0].composite_score == 0.9
        assert sorted_contexts[1].composite_score == 0.6
        assert sorted_contexts[2].composite_score == 0.3


class TestBaseScorer:
    """Tests for the BaseScorer abstract class."""

    def test_weight_property(self) -> None:
        """Test the weight property."""
        # Use a concrete implementation
        scorer = RelevanceScorer(weight=0.7)
        assert scorer.weight == 0.7

    def test_weight_setter_valid(self) -> None:
        """Test the weight setter with valid values."""
        scorer = RelevanceScorer()
        scorer.weight = 0.5
        assert scorer.weight == 0.5

    def test_weight_setter_invalid(self) -> None:
        """Test the weight setter with invalid values."""
        scorer = RelevanceScorer()

        with pytest.raises(ValueError):
            scorer.weight = -0.1

        with pytest.raises(ValueError):
            scorer.weight = 1.5

    def test_normalize_score(self) -> None:
        """Test score normalization."""
        scorer = RelevanceScorer()

        # Normal range
        assert scorer.normalize_score(0.5) == 0.5

        # Below 0
        assert scorer.normalize_score(-0.5) == 0.0

        # Above 1
        assert scorer.normalize_score(1.5) == 1.0

        # Boundaries
        assert scorer.normalize_score(0.0) == 0.0
        assert scorer.normalize_score(1.0) == 1.0
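
    # NOTE: These boundary checks are consistent with a plain clamp,
    # normalize_score(x) == max(0.0, min(1.0, x)) (an assumption about the
    # implementation): out-of-range inputs saturate rather than raise.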


class TestCompositeScorerEdgeCases:
    """Tests for CompositeScorer edge cases and lock management."""

    @pytest.mark.asyncio
    async def test_score_with_zero_weights(self) -> None:
        """Test scoring when all weights are zero."""
        scorer = CompositeScorer(
            relevance_weight=0.0,
            recency_weight=0.0,
            priority_weight=0.0,
        )

        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
        )

        # Should return 0.0 when the total weight is 0
        score = await scorer.score(context, "test query")
        assert score == 0.0
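
    # NOTE: Returning 0.0 here implies the implementation guards the
    # weighted-average division against a zero total, presumably something
    # like (an assumed sketch, not the verified code):
    #
    #     total = w_relevance + w_recency + w_priority
    #     if total == 0:
    #         return 0.0
    #
    # rather than dividing by zero.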

    @pytest.mark.asyncio
    async def test_score_batch_sequential(self) -> None:
        """Test batch scoring in sequential mode (parallel=False)."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(
                content="Content 1",
                source="docs",
                relevance_score=0.8,
            ),
            KnowledgeContext(
                content="Content 2",
                source="docs",
                relevance_score=0.5,
            ),
        ]

        # Use parallel=False to cover the sequential path
        scored = await scorer.score_batch(contexts, "query", parallel=False)

        assert len(scored) == 2
        assert scored[0].relevance_score == 0.8
        assert scored[1].relevance_score == 0.5

    @pytest.mark.asyncio
    async def test_lock_fast_path_reuse(self) -> None:
        """Test that existing locks are reused via the fast path."""
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test",
            source="docs",
            relevance_score=0.5,
        )

        # First access creates the lock
        lock1 = await scorer._get_context_lock(context.id)

        # Second access should hit the fast path (lock exists in the dict)
        lock2 = await scorer._get_context_lock(context.id)

        assert lock2 is lock1  # Same lock object returned

    @pytest.mark.asyncio
    async def test_lock_cleanup_when_limit_reached(self) -> None:
        """Test that old locks are cleaned up when the limit is reached."""
        # Create a scorer with very low max_locks to trigger cleanup
        scorer = CompositeScorer()
        scorer._max_locks = 3
        scorer._lock_ttl = 0.1  # 100ms TTL

        # Create locks for several context IDs
        context_ids = [f"ctx-{i}" for i in range(5)]

        # Get locks for the first 3 contexts (fill up to the limit)
        for ctx_id in context_ids[:3]:
            await scorer._get_context_lock(ctx_id)

        # Wait for the TTL to expire
        time.sleep(0.15)

        # Getting a lock for a new context should trigger cleanup
        await scorer._get_context_lock(context_ids[3])

        # Some old locks should have been cleaned up; the exact number
        # depends on the cleanup logic
        assert len(scorer._context_locks) <= scorer._max_locks + 1
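
    # NOTE: The behavior exercised above is consistent with a TTL-based
    # sweep, roughly like this hypothetical sketch (_context_locks is
    # assumed to map context id -> (lock, last_used_timestamp)):
    #
    #     if len(self._context_locks) >= self._max_locks:
    #         cutoff = time.monotonic() - self._lock_ttl
    #         for ctx_id, (lock, last_used) in list(self._context_locks.items()):
    #             if last_used < cutoff and not lock.locked():
    #                 del self._context_locks[ctx_id]
    #
    # The `+ 1` in the assertion allows for the lock that was just added,
    # and the `not lock.locked()` guard is what the next test verifies.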

    @pytest.mark.asyncio
    async def test_lock_cleanup_preserves_held_locks(self) -> None:
        """Test that cleanup doesn't remove locks that are currently held."""
        scorer = CompositeScorer()
        scorer._max_locks = 2
        scorer._lock_ttl = 0.05  # 50ms TTL

        # Get and hold lock1
        lock1 = await scorer._get_context_lock("ctx-1")
        async with lock1:
            # While holding lock1, add more locks
            await scorer._get_context_lock("ctx-2")
            time.sleep(0.1)  # Let the TTL expire
            # Adding another should trigger cleanup
            await scorer._get_context_lock("ctx-3")

        # lock1 should still exist (it's held)
        assert any(lock is lock1 for lock, _ in scorer._context_locks.values())

    @pytest.mark.asyncio
    async def test_concurrent_lock_acquisition_double_check(self) -> None:
        """Test that concurrent lock acquisition uses the double-check pattern."""
        scorer = CompositeScorer()

        context_id = "test-context-id"

        # Simulate concurrent lock acquisition
        async def get_lock():
            return await scorer._get_context_lock(context_id)

        locks = await asyncio.gather(*[get_lock() for _ in range(10)])

        # All should get the same lock (the double-check pattern ensures this)
        assert all(lock is locks[0] for lock in locks)
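
    # NOTE: Ten concurrent callers receiving one shared lock is the
    # signature of double-checked locking, roughly (an assumed sketch;
    # `_locks_guard` and the tuple layout are hypothetical names, not the
    # verified implementation):
    #
    #     async def _get_context_lock(self, context_id: str) -> asyncio.Lock:
    #         if context_id in self._context_locks:          # fast path
    #             return self._context_locks[context_id][0]
    #         async with self._locks_guard:
    #             if context_id in self._context_locks:      # re-check
    #                 return self._context_locks[context_id][0]
    #             lock = asyncio.Lock()
    #             self._context_locks[context_id] = (lock, time.monotonic())
    #             return lock
    #
    # The re-check after acquiring the guard prevents two concurrent callers
    # from each creating a distinct lock for the same context id.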