"""Tests for context scoring module."""
|
|
|
|
from datetime import UTC, datetime, timedelta
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
import pytest
|
|
|
|
from app.services.context.scoring import (
|
|
CompositeScorer,
|
|
PriorityScorer,
|
|
RecencyScorer,
|
|
RelevanceScorer,
|
|
ScoredContext,
|
|
)
|
|
from app.services.context.types import (
|
|
ContextPriority,
|
|
ContextType,
|
|
ConversationContext,
|
|
KnowledgeContext,
|
|
MessageRole,
|
|
SystemContext,
|
|
TaskContext,
|
|
)
|
|
|
|
|
|


class TestRelevanceScorer:
    """Tests for RelevanceScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RelevanceScorer()
        assert scorer.weight == 1.0

    def test_creation_with_weight(self) -> None:
        """Test scorer creation with custom weight."""
        scorer = RelevanceScorer(weight=0.5)
        assert scorer.weight == 0.5

    @pytest.mark.asyncio
    async def test_score_with_precomputed_relevance(self) -> None:
        """Test scoring with a pre-computed relevance score."""
        scorer = RelevanceScorer()

        # KnowledgeContext with a pre-computed score
        context = KnowledgeContext(
            content="Test content about Python",
            source="docs/python.md",
            relevance_score=0.85,
        )

        score = await scorer.score(context, "Python programming")
        assert score == 0.85

    @pytest.mark.asyncio
    async def test_score_with_metadata_score(self) -> None:
        """Test scoring with a metadata-provided score."""
        scorer = RelevanceScorer()

        context = SystemContext(
            content="System prompt",
            source="system",
            metadata={"relevance_score": 0.9},
        )

        score = await scorer.score(context, "anything")
        assert score == 0.9

    @pytest.mark.asyncio
    async def test_score_fallback_to_keyword_matching(self) -> None:
        """Test fallback to keyword matching when no score is available."""
        scorer = RelevanceScorer(keyword_fallback_weight=0.5)

        context = TaskContext(
            content="Implement authentication with JWT tokens",
            source="task",
        )

        # Query has matching keywords
        score = await scorer.score(context, "JWT authentication")
        assert score > 0

    @pytest.mark.asyncio
    async def test_keyword_matching_no_overlap(self) -> None:
        """Test keyword matching with no query overlap."""
        scorer = RelevanceScorer()

        context = TaskContext(
            content="Implement database migration",
            source="task",
        )

        score = await scorer.score(context, "xyz abc 123")
        assert score == 0.0

    @pytest.mark.asyncio
    async def test_keyword_matching_full_overlap(self) -> None:
        """Test keyword matching with high overlap."""
        scorer = RelevanceScorer(keyword_fallback_weight=1.0)

        context = TaskContext(
            content="python programming language",
            source="task",
        )

        score = await scorer.score(context, "python programming")
        # Should have a high score due to keyword overlap
        assert score > 0.5
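
    # NOTE: The keyword-fallback tests above assume a bag-of-words overlap
    # heuristic roughly like the hypothetical sketch below (names and exact
    # tokenization are assumptions, not the verified implementation):
    #
    #     def _keyword_score(content: str, query: str, weight: float) -> float:
    #         content_words = set(content.lower().split())
    #         query_words = set(query.lower().split())
    #         if not query_words:
    #             return 0.0
    #         overlap = len(content_words & query_words) / len(query_words)
    #         return overlap * weight
    #
    # Under that model, "python programming" against "python programming
    # language" overlaps fully, so keyword_fallback_weight=1.0 clears the
    # 0.5 threshold asserted above, and a query with no shared words
    # scores exactly 0.0.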

    @pytest.mark.asyncio
    async def test_score_with_mcp_success(self) -> None:
        """Test scoring with MCP semantic similarity."""
        mock_mcp = MagicMock()
        mock_result = MagicMock()
        mock_result.success = True
        mock_result.data = {"similarity": 0.75}
        mock_mcp.call_tool = AsyncMock(return_value=mock_result)

        scorer = RelevanceScorer(mcp_manager=mock_mcp)

        context = TaskContext(
            content="Test content",
            source="task",
        )

        score = await scorer.score(context, "test query")
        assert score == 0.75

    @pytest.mark.asyncio
    async def test_score_with_mcp_failure_fallback(self) -> None:
        """Test fallback when MCP fails."""
        mock_mcp = MagicMock()
        mock_mcp.call_tool = AsyncMock(side_effect=Exception("Connection failed"))

        scorer = RelevanceScorer(mcp_manager=mock_mcp, keyword_fallback_weight=0.5)

        context = TaskContext(
            content="Python programming code",
            source="task",
        )

        # Should fall back to keyword matching
        score = await scorer.score(context, "Python code")
        assert score > 0

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RelevanceScorer()

        contexts = [
            KnowledgeContext(content="Python", source="1", relevance_score=0.8),
            KnowledgeContext(content="Java", source="2", relevance_score=0.6),
            KnowledgeContext(content="Go", source="3", relevance_score=0.9),
        ]

        scores = await scorer.score_batch(contexts, "test")
        assert len(scores) == 3
        assert scores[0] == 0.8
        assert scores[1] == 0.6
        assert scores[2] == 0.9

    def test_set_mcp_manager(self) -> None:
        """Test setting the MCP manager."""
        scorer = RelevanceScorer()
        assert scorer._mcp is None

        mock_mcp = MagicMock()
        scorer.set_mcp_manager(mock_mcp)
        assert scorer._mcp is mock_mcp


class TestRecencyScorer:
    """Tests for RecencyScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = RecencyScorer()
        assert scorer.weight == 1.0
        assert scorer._half_life_hours == 24.0

    def test_creation_with_custom_half_life(self) -> None:
        """Test scorer creation with custom half-life."""
        scorer = RecencyScorer(half_life_hours=12.0)
        assert scorer._half_life_hours == 12.0

    @pytest.mark.asyncio
    async def test_score_recent_context(self) -> None:
        """Test scoring a very recent context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)

        context = TaskContext(
            content="Recent task",
            source="task",
            timestamp=now,
        )

        score = await scorer.score(context, "query", reference_time=now)
        # Very recent should have score near 1.0
        assert score > 0.99

    @pytest.mark.asyncio
    async def test_score_at_half_life(self) -> None:
        """Test scoring at exactly half-life age."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        half_life_ago = now - timedelta(hours=24)

        context = TaskContext(
            content="Day old task",
            source="task",
            timestamp=half_life_ago,
        )

        score = await scorer.score(context, "query", reference_time=now)
        # At half-life, score should be ~0.5
        assert 0.49 <= score <= 0.51
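
    # NOTE: The half-life assertions assume standard exponential decay
    # (an assumption about the implementation, not verified here):
    #
    #     score = 0.5 ** (age_hours / half_life_hours)
    #
    # Age 0 gives 1.0, age == half_life gives exactly 0.5, and 7 days at a
    # 24h half-life gives 0.5 ** 7 ≈ 0.0078, which is why the next test
    # expects a score below 0.01.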

    @pytest.mark.asyncio
    async def test_score_old_context(self) -> None:
        """Test scoring a very old context."""
        scorer = RecencyScorer(half_life_hours=24.0)
        now = datetime.now(UTC)
        week_ago = now - timedelta(days=7)

        context = TaskContext(
            content="Week old task",
            source="task",
            timestamp=week_ago,
        )

        score = await scorer.score(context, "query", reference_time=now)
        # 7 days with 24h half-life = very low score
        assert score < 0.01

    @pytest.mark.asyncio
    async def test_type_specific_half_lives(self) -> None:
        """Test that different context types have different half-lives."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)
        one_hour_ago = now - timedelta(hours=1)

        # Conversation has a 1 hour half-life by default
        conv_context = ConversationContext(
            content="Hello",
            source="chat",
            role=MessageRole.USER,
            timestamp=one_hour_ago,
        )

        # Knowledge has a 168 hour (1 week) half-life by default
        knowledge_context = KnowledgeContext(
            content="Documentation",
            source="docs",
            timestamp=one_hour_ago,
        )

        conv_score = await scorer.score(conv_context, "query", reference_time=now)
        knowledge_score = await scorer.score(
            knowledge_context, "query", reference_time=now
        )

        # Conversation should decay much faster
        assert conv_score < knowledge_score

    def test_get_half_life(self) -> None:
        """Test getting the half-life for a context type."""
        scorer = RecencyScorer()

        assert scorer.get_half_life(ContextType.CONVERSATION) == 1.0
        assert scorer.get_half_life(ContextType.KNOWLEDGE) == 168.0
        assert scorer.get_half_life(ContextType.SYSTEM) == 720.0

    def test_set_half_life(self) -> None:
        """Test setting a custom half-life."""
        scorer = RecencyScorer()

        scorer.set_half_life(ContextType.TASK, 48.0)
        assert scorer.get_half_life(ContextType.TASK) == 48.0

    def test_set_half_life_invalid(self) -> None:
        """Test setting an invalid half-life."""
        scorer = RecencyScorer()

        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, 0)

        with pytest.raises(ValueError):
            scorer.set_half_life(ContextType.TASK, -1)

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = RecencyScorer()
        now = datetime.now(UTC)

        contexts = [
            TaskContext(content="1", source="t", timestamp=now),
            TaskContext(content="2", source="t", timestamp=now - timedelta(hours=24)),
            TaskContext(content="3", source="t", timestamp=now - timedelta(hours=48)),
        ]

        scores = await scorer.score_batch(contexts, "query", reference_time=now)
        assert len(scores) == 3
        # Scores should be in descending order (more recent = higher)
        assert scores[0] > scores[1] > scores[2]


class TestPriorityScorer:
    """Tests for PriorityScorer."""

    def test_creation(self) -> None:
        """Test scorer creation."""
        scorer = PriorityScorer()
        assert scorer.weight == 1.0

    @pytest.mark.asyncio
    async def test_score_critical_priority(self) -> None:
        """Test scoring CRITICAL priority context."""
        scorer = PriorityScorer()

        context = SystemContext(
            content="Critical system prompt",
            source="system",
            priority=ContextPriority.CRITICAL.value,
        )

        score = await scorer.score(context, "query")
        # CRITICAL (100) + type bonus should be > 1.0, normalized to 1.0
        assert score == 1.0

    @pytest.mark.asyncio
    async def test_score_normal_priority(self) -> None:
        """Test scoring NORMAL priority context."""
        scorer = PriorityScorer()

        context = TaskContext(
            content="Normal task",
            source="task",
            priority=ContextPriority.NORMAL.value,
        )

        score = await scorer.score(context, "query")
        # NORMAL (50) = 0.5, plus TASK bonus (0.15) = 0.65
        assert 0.6 <= score <= 0.7
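
    # NOTE: The expected ranges above and below are consistent with priority
    # being normalized from the 0-100 enum scale and combined with a per-type
    # bonus, roughly (a hypothetical sketch, not the verified implementation):
    #
    #     score = min(1.0, priority / 100 + type_bonus)
    #
    # e.g. NORMAL (50) on a TASK context: 0.5 + 0.15 = 0.65, while
    # CRITICAL (100) saturates at 1.0 after clamping.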

    @pytest.mark.asyncio
    async def test_score_low_priority(self) -> None:
        """Test scoring LOW priority context."""
        scorer = PriorityScorer()

        context = KnowledgeContext(
            content="Low priority knowledge",
            source="docs",
            priority=ContextPriority.LOW.value,
        )

        score = await scorer.score(context, "query")
        # LOW (20) = 0.2, no bonus for KNOWLEDGE
        assert 0.15 <= score <= 0.25

    @pytest.mark.asyncio
    async def test_type_bonuses(self) -> None:
        """Test type-specific priority bonuses."""
        scorer = PriorityScorer()

        # All with the same base priority
        system_ctx = SystemContext(
            content="System",
            source="system",
            priority=50,
        )
        task_ctx = TaskContext(
            content="Task",
            source="task",
            priority=50,
        )
        knowledge_ctx = KnowledgeContext(
            content="Knowledge",
            source="docs",
            priority=50,
        )

        system_score = await scorer.score(system_ctx, "query")
        task_score = await scorer.score(task_ctx, "query")
        knowledge_score = await scorer.score(knowledge_ctx, "query")

        # System has the highest bonus (0.2), task next (0.15), knowledge has none
        assert system_score > task_score > knowledge_score

    def test_get_type_bonus(self) -> None:
        """Test getting the type bonus."""
        scorer = PriorityScorer()

        assert scorer.get_type_bonus(ContextType.SYSTEM) == 0.2
        assert scorer.get_type_bonus(ContextType.TASK) == 0.15
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.0

    def test_set_type_bonus(self) -> None:
        """Test setting a custom type bonus."""
        scorer = PriorityScorer()

        scorer.set_type_bonus(ContextType.KNOWLEDGE, 0.1)
        assert scorer.get_type_bonus(ContextType.KNOWLEDGE) == 0.1

    def test_set_type_bonus_invalid(self) -> None:
        """Test setting an invalid type bonus."""
        scorer = PriorityScorer()

        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, 1.5)

        with pytest.raises(ValueError):
            scorer.set_type_bonus(ContextType.KNOWLEDGE, -0.1)


class TestCompositeScorer:
    """Tests for CompositeScorer."""

    def test_creation(self) -> None:
        """Test scorer creation with default weights."""
        scorer = CompositeScorer()

        weights = scorer.weights
        assert weights["relevance"] == 0.5
        assert weights["recency"] == 0.3
        assert weights["priority"] == 0.2

    def test_creation_with_custom_weights(self) -> None:
        """Test scorer creation with custom weights."""
        scorer = CompositeScorer(
            relevance_weight=0.6,
            recency_weight=0.2,
            priority_weight=0.2,
        )

        weights = scorer.weights
        assert weights["relevance"] == 0.6
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.2
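
    # NOTE: These tests treat the composite score as a weighted average of
    # the component scores (a sketch under that assumption):
    #
    #     composite = (
    #         w_relevance * relevance + w_recency * recency + w_priority * priority
    #     ) / (w_relevance + w_recency + w_priority)
    #
    # With the defaults (0.5 / 0.3 / 0.2) the weights already sum to 1.0, so
    # the division is a no-op; it matters once weights are updated piecemeal
    # as in the next test.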

    def test_update_weights(self) -> None:
        """Test updating weights."""
        scorer = CompositeScorer()

        scorer.update_weights(relevance=0.7, recency=0.2, priority=0.1)

        weights = scorer.weights
        assert weights["relevance"] == 0.7
        assert weights["recency"] == 0.2
        assert weights["priority"] == 0.1

    def test_update_weights_partial(self) -> None:
        """Test partially updating weights."""
        scorer = CompositeScorer()
        original_recency = scorer.weights["recency"]

        scorer.update_weights(relevance=0.8)

        assert scorer.weights["relevance"] == 0.8
        assert scorer.weights["recency"] == original_recency

    @pytest.mark.asyncio
    async def test_score_basic(self) -> None:
        """Test basic composite scoring."""
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.NORMAL.value,
        )

        score = await scorer.score(context, "test query")
        assert 0.0 <= score <= 1.0

    @pytest.mark.asyncio
    async def test_score_with_details(self) -> None:
        """Test scoring with a detailed breakdown."""
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
            timestamp=datetime.now(UTC),
            priority=ContextPriority.HIGH.value,
        )

        scored = await scorer.score_with_details(context, "test query")

        assert isinstance(scored, ScoredContext)
        assert scored.context is context
        assert 0.0 <= scored.composite_score <= 1.0
        assert scored.relevance_score == 0.8
        assert scored.recency_score > 0.9  # Very recent
        assert scored.priority_score > 0.5  # HIGH priority

    @pytest.mark.asyncio
    async def test_score_not_cached_on_context(self) -> None:
        """Test that scores are NOT cached on the context.

        Scores are query-dependent, so caching them on the context would
        hand back incorrect values for a different query.
        """
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test",
            source="docs",
            relevance_score=0.5,
        )

        # After scoring, context._score should remain None: the scorer
        # computes fresh scores each time rather than caching on the
        # context object.
        await scorer.score(context, "query")

        # Score again with different queries - each should be a fresh,
        # valid score (not necessarily equal, since the queries differ)
        score1 = await scorer.score(context, "query 1")
        score2 = await scorer.score(context, "query 2")
        assert 0.0 <= score1 <= 1.0
        assert 0.0 <= score2 <= 1.0

    @pytest.mark.asyncio
    async def test_score_batch(self) -> None:
        """Test batch scoring."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(
                content="High relevance",
                source="docs",
                relevance_score=0.9,
            ),
            KnowledgeContext(
                content="Low relevance",
                source="docs",
                relevance_score=0.2,
            ),
        ]

        scored = await scorer.score_batch(contexts, "query")
        assert len(scored) == 2
        assert scored[0].relevance_score > scored[1].relevance_score

    @pytest.mark.asyncio
    async def test_rank(self) -> None:
        """Test ranking contexts."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(content="Low", source="docs", relevance_score=0.2),
            KnowledgeContext(content="High", source="docs", relevance_score=0.9),
            KnowledgeContext(content="Medium", source="docs", relevance_score=0.5),
        ]

        ranked = await scorer.rank(contexts, "query")

        # Should be sorted by score (highest first)
        assert len(ranked) == 3
        assert ranked[0].relevance_score == 0.9
        assert ranked[1].relevance_score == 0.5
        assert ranked[2].relevance_score == 0.2

    @pytest.mark.asyncio
    async def test_rank_with_limit(self) -> None:
        """Test ranking with a limit."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(content=str(i), source="docs", relevance_score=i / 10)
            for i in range(10)
        ]

        ranked = await scorer.rank(contexts, "query", limit=3)
        assert len(ranked) == 3

    @pytest.mark.asyncio
    async def test_rank_with_min_score(self) -> None:
        """Test ranking with a minimum score threshold."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(content="Low", source="docs", relevance_score=0.1),
            KnowledgeContext(content="High", source="docs", relevance_score=0.9),
        ]

        ranked = await scorer.rank(contexts, "query", min_score=0.5)

        # The composite score also mixes in recency and priority, so the
        # low-relevance context may or may not clear the 0.5 threshold;
        # only an upper bound is asserted.
        assert len(ranked) <= 2

    def test_set_mcp_manager(self) -> None:
        """Test setting the MCP manager."""
        scorer = CompositeScorer()
        mock_mcp = MagicMock()

        scorer.set_mcp_manager(mock_mcp)
        assert scorer._relevance_scorer._mcp is mock_mcp

    @pytest.mark.asyncio
    async def test_concurrent_scoring_same_context_no_race(self) -> None:
        """Test that concurrent scoring of the same context doesn't race.

        This verifies that the per-context locking mechanism keeps results
        consistent when the same context is scored many times concurrently.
        """
        # Use a scorer with recency_weight=0 to eliminate time-dependent
        # variation (recency scores change as time passes between calls)
        scorer = CompositeScorer(
            relevance_weight=0.5,
            recency_weight=0.0,  # Disable recency to get deterministic results
            priority_weight=0.5,
        )

        # Create a single context that will be scored multiple times concurrently
        context = KnowledgeContext(
            content="Test content for race condition test",
            source="docs",
            relevance_score=0.75,
        )

        # Score the same context many times in parallel
        num_concurrent = 50
        tasks = [scorer.score(context, "test query") for _ in range(num_concurrent)]
        scores = await asyncio.gather(*tasks)

        # All scores should be identical (deterministic scoring without recency)
        assert all(s == scores[0] for s in scores)
        # Note: We don't cache _score on the context because scores are query-dependent

    @pytest.mark.asyncio
    async def test_concurrent_scoring_different_contexts(self) -> None:
        """Test that concurrent scoring of different contexts works correctly.

        Different contexts should not interfere with each other during
        parallel scoring.
        """
        scorer = CompositeScorer()

        # Create many different contexts
        contexts = [
            KnowledgeContext(
                content=f"Test content {i}",
                source="docs",
                relevance_score=i / 10,
            )
            for i in range(10)
        ]

        # Score all contexts concurrently
        tasks = [scorer.score(ctx, "test query") for ctx in contexts]
        scores = await asyncio.gather(*tasks)

        # Each context should have a different score based on its relevance
        assert len(set(scores)) > 1  # Not all the same


class TestScoredContext:
    """Tests for the ScoredContext dataclass."""

    def test_creation(self) -> None:
        """Test ScoredContext creation."""
        context = TaskContext(content="Test", source="task")
        scored = ScoredContext(
            context=context,
            composite_score=0.75,
            relevance_score=0.8,
            recency_score=0.7,
            priority_score=0.5,
        )

        assert scored.context is context
        assert scored.composite_score == 0.75

    def test_comparison_operators(self) -> None:
        """Test comparison operators for sorting."""
        ctx1 = TaskContext(content="1", source="task")
        ctx2 = TaskContext(content="2", source="task")

        scored1 = ScoredContext(context=ctx1, composite_score=0.5)
        scored2 = ScoredContext(context=ctx2, composite_score=0.8)

        assert scored1 < scored2
        assert scored2 > scored1
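
    # NOTE: The assertions above assume ScoredContext defines ordering by
    # composite_score (e.g. a dataclass whose __lt__ compares
    # self.composite_score < other.composite_score); that assumed ordering
    # is also what lets sorted(..., reverse=True) rank highest-first in the
    # next test.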

    def test_sorting(self) -> None:
        """Test sorting scored contexts."""
        contexts = [
            ScoredContext(
                context=TaskContext(content="Low", source="task"),
                composite_score=0.3,
            ),
            ScoredContext(
                context=TaskContext(content="High", source="task"),
                composite_score=0.9,
            ),
            ScoredContext(
                context=TaskContext(content="Medium", source="task"),
                composite_score=0.6,
            ),
        ]

        sorted_contexts = sorted(contexts, reverse=True)

        assert sorted_contexts[0].composite_score == 0.9
        assert sorted_contexts[1].composite_score == 0.6
        assert sorted_contexts[2].composite_score == 0.3


class TestBaseScorer:
    """Tests for the BaseScorer abstract class."""

    def test_weight_property(self) -> None:
        """Test the weight property."""
        # Use a concrete implementation
        scorer = RelevanceScorer(weight=0.7)
        assert scorer.weight == 0.7

    def test_weight_setter_valid(self) -> None:
        """Test the weight setter with valid values."""
        scorer = RelevanceScorer()
        scorer.weight = 0.5
        assert scorer.weight == 0.5

    def test_weight_setter_invalid(self) -> None:
        """Test the weight setter with invalid values."""
        scorer = RelevanceScorer()

        with pytest.raises(ValueError):
            scorer.weight = -0.1

        with pytest.raises(ValueError):
            scorer.weight = 1.5

    def test_normalize_score(self) -> None:
        """Test score normalization."""
        scorer = RelevanceScorer()

        # Normal range
        assert scorer.normalize_score(0.5) == 0.5

        # Below 0
        assert scorer.normalize_score(-0.5) == 0.0

        # Above 1
        assert scorer.normalize_score(1.5) == 1.0

        # Boundaries
        assert scorer.normalize_score(0.0) == 0.0
        assert scorer.normalize_score(1.0) == 1.0
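
    # NOTE: These boundary checks are consistent with a plain clamp,
    # normalize_score(x) == max(0.0, min(1.0, x)) (an assumption about the
    # implementation): out-of-range inputs saturate rather than raise.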


class TestCompositeScorerEdgeCases:
    """Tests for CompositeScorer edge cases and lock management."""

    @pytest.mark.asyncio
    async def test_score_with_zero_weights(self) -> None:
        """Test scoring when all weights are zero."""
        scorer = CompositeScorer(
            relevance_weight=0.0,
            recency_weight=0.0,
            priority_weight=0.0,
        )

        context = KnowledgeContext(
            content="Test content",
            source="docs",
            relevance_score=0.8,
        )

        # Should return 0.0 when the total weight is 0
        score = await scorer.score(context, "test query")
        assert score == 0.0
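
    # NOTE: Returning 0.0 here implies the implementation guards the
    # weighted-average division against a zero total, presumably something
    # like (an assumed sketch, not the verified code):
    #
    #     total = w_relevance + w_recency + w_priority
    #     if total == 0:
    #         return 0.0
    #
    # rather than dividing by zero.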

    @pytest.mark.asyncio
    async def test_score_batch_sequential(self) -> None:
        """Test batch scoring in sequential mode (parallel=False)."""
        scorer = CompositeScorer()

        contexts = [
            KnowledgeContext(
                content="Content 1",
                source="docs",
                relevance_score=0.8,
            ),
            KnowledgeContext(
                content="Content 2",
                source="docs",
                relevance_score=0.5,
            ),
        ]

        # Use parallel=False to cover the sequential path
        scored = await scorer.score_batch(contexts, "query", parallel=False)

        assert len(scored) == 2
        assert scored[0].relevance_score == 0.8
        assert scored[1].relevance_score == 0.5

    @pytest.mark.asyncio
    async def test_lock_fast_path_reuse(self) -> None:
        """Test that existing locks are reused via the fast path."""
        scorer = CompositeScorer()

        context = KnowledgeContext(
            content="Test",
            source="docs",
            relevance_score=0.5,
        )

        # First access creates the lock
        lock1 = await scorer._get_context_lock(context.id)

        # Second access should hit the fast path (lock exists in the dict)
        lock2 = await scorer._get_context_lock(context.id)

        assert lock2 is lock1  # Same lock object returned

    @pytest.mark.asyncio
    async def test_lock_cleanup_when_limit_reached(self) -> None:
        """Test that old locks are cleaned up when the limit is reached."""
        # Create a scorer with very low max_locks to trigger cleanup
        scorer = CompositeScorer()
        scorer._max_locks = 3
        scorer._lock_ttl = 0.1  # 100ms TTL

        # Create locks for several context IDs
        context_ids = [f"ctx-{i}" for i in range(5)]

        # Get locks for the first 3 contexts (fill up to the limit)
        for ctx_id in context_ids[:3]:
            await scorer._get_context_lock(ctx_id)

        # Wait for the TTL to expire
        time.sleep(0.15)

        # Getting a lock for a new context should trigger cleanup
        await scorer._get_context_lock(context_ids[3])

        # Some old locks should have been cleaned up; the exact number
        # depends on the cleanup logic
        assert len(scorer._context_locks) <= scorer._max_locks + 1
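
    # NOTE: The behavior exercised above is consistent with a TTL-based
    # sweep, roughly like this hypothetical sketch (_context_locks is
    # assumed to map context id -> (lock, last_used_timestamp)):
    #
    #     if len(self._context_locks) >= self._max_locks:
    #         cutoff = time.monotonic() - self._lock_ttl
    #         for ctx_id, (lock, last_used) in list(self._context_locks.items()):
    #             if last_used < cutoff and not lock.locked():
    #                 del self._context_locks[ctx_id]
    #
    # The `+ 1` in the assertion allows for the lock that was just added,
    # and the `not lock.locked()` guard is what the next test verifies.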

    @pytest.mark.asyncio
    async def test_lock_cleanup_preserves_held_locks(self) -> None:
        """Test that cleanup doesn't remove locks that are currently held."""
        scorer = CompositeScorer()
        scorer._max_locks = 2
        scorer._lock_ttl = 0.05  # 50ms TTL

        # Get and hold lock1
        lock1 = await scorer._get_context_lock("ctx-1")
        async with lock1:
            # While holding lock1, add more locks
            await scorer._get_context_lock("ctx-2")
            time.sleep(0.1)  # Let the TTL expire
            # Adding another should trigger cleanup
            await scorer._get_context_lock("ctx-3")

        # lock1 should still exist (it's held)
        assert any(lock is lock1 for lock, _ in scorer._context_locks.values())

    @pytest.mark.asyncio
    async def test_concurrent_lock_acquisition_double_check(self) -> None:
        """Test that concurrent lock acquisition uses the double-check pattern."""
        scorer = CompositeScorer()

        context_id = "test-context-id"

        # Simulate concurrent lock acquisition
        async def get_lock():
            return await scorer._get_context_lock(context_id)

        locks = await asyncio.gather(*[get_lock() for _ in range(10)])

        # All should get the same lock (the double-check pattern ensures this)
        assert all(lock is locks[0] for lock in locks)
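
    # NOTE: Ten concurrent callers receiving one shared lock is the
    # signature of double-checked locking, roughly (an assumed sketch;
    # `_locks_guard` and the tuple layout are hypothetical names, not the
    # verified implementation):
    #
    #     async def _get_context_lock(self, context_id: str) -> asyncio.Lock:
    #         if context_id in self._context_locks:          # fast path
    #             return self._context_locks[context_id][0]
    #         async with self._locks_guard:
    #             if context_id in self._context_locks:      # re-check
    #                 return self._context_locks[context_id][0]
    #             lock = asyncio.Lock()
    #             self._context_locks[context_id] = (lock, time.monotonic())
    #             return lock
    #
    # The re-check after acquiring the guard prevents two concurrent callers
    # from each creating a distinct lock for the same context id.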