#!/usr/bin/env bash
# Setup agentic evaluation tools
#
# Steps, in order:
#   1. Create (if missing) and activate the Python virtual environment at
#      $PROJECT_ROOT/.venv.
#   2. Install the packages listed in $PROJECT_ROOT/requirements.txt.
#   3. Probe localhost for a running llama-server or ollama instance.
#   4. Print a summary of the installed tools and the make targets that
#      run them.
#
# NOTE(review): PROJECT_ROOT and the log_* helpers are not defined in this
# file; presumably they are provided by lib/common.sh — confirm.

# Strict mode: exit on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of the directory containing this script, used to locate the
# shared library relative to the script rather than the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────
# Creates the project venv on first run and activates it on every run, so
# the rest of the script installs into (and reports from) the venv rather
# than the system Python.
VENV_DIR="$PROJECT_ROOT/.venv"
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"

# The presence of bin/activate is the marker for "venv already created".
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
    # Prefer Python 3.13 (bigcodebench requires <3.14)
    PYTHON_BIN="python3.13"
    if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
        PYTHON_BIN="python3"
        # Verify the fallback too: without this check a machine with no
        # Python at all gets a warning containing an empty version string
        # followed by a bare "command not found".
        if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
            printf 'ERROR: neither python3.13 nor python3 found on PATH; cannot create %s\n' \
                "$VENV_DIR" >&2
            exit 1
        fi
        log_warn "python3.13 not found, using $("$PYTHON_BIN" --version). bigcodebench may not install."
    fi
    log_info "Creating virtual environment with $("$PYTHON_BIN" --version)..."
    "$PYTHON_BIN" -m venv "$VENV_DIR"
    log_success "Virtual environment created at $VENV_DIR"
fi

# Activate for the remainder of this script only; the caller's shell is not
# affected because the script runs in its own process.
# shellcheck source=/dev/null
source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"

# ── Install from requirements.txt ────────────────────────
# Fail early with a clear message instead of letting pip report a missing
# file through the truncated (tail -5) output.
if [[ ! -f "$REQUIREMENTS" ]]; then
    printf 'ERROR: requirements file not found: %s\n' "$REQUIREMENTS" >&2
    exit 1
fi

log_info "Installing dependencies from requirements.txt..."
# "python3 -m pip" runs pip under the interpreter that is currently first on
# PATH, rather than whichever "pip" launcher script happens to be found.
# Output is trimmed to the last 5 lines to keep the log short; pipefail makes
# the pipeline report pip's exit status, so a failed install is still caught.
if ! python3 -m pip install -r "$REQUIREMENTS" 2>&1 | tail -5; then
    printf 'ERROR: dependency installation failed (see pip output above)\n' >&2
    exit 1
fi
log_success "Dependencies installed"

# ── Check for local LLM server ──────────────────────────
# Informational only: reports which local server (if any) answers, and never
# fails the setup.
log_header "LLM Server Check"

# -s: silent, -f: treat HTTP errors as failure. The timeouts bound each probe
# so a port that accepts connections but never answers cannot hang setup.
probe=(curl -sf --connect-timeout 2 --max-time 5)

if ! command -v curl >/dev/null 2>&1; then
    # Without curl both probes would fail and be misreported as "no server".
    log_warn "curl not found; cannot probe for a local LLM server."
elif "${probe[@]}" http://localhost:8080/health >/dev/null 2>&1; then
    log_success "llama-server running at localhost:8080"
elif "${probe[@]}" http://localhost:11434/api/tags >/dev/null 2>&1; then
    log_success "ollama running at localhost:11434"
else
    log_warn "No local LLM server running. Start one before running evals:"
    log_info "  make serve ARGS=\"-m MODEL.gguf\"        (llama-server)"
    log_info "  ollama serve                            (ollama)"
fi

# ── Summary ──────────────────────────────────────────────
# Final report: which tools were installed, how to re-activate the venv, and
# which make targets run the evaluations.
log_header "Setup Complete"

# One printf call emits every line; each argument becomes one output line,
# and empty arguments produce the blank separator lines.
printf '%s\n' \
    "" \
    "  Installed tools:" \
    "    inspect-ai      — All-in-one eval framework (IFEval, BFCL, GAIA, ...)" \
    "    inspect-evals   — Task definitions for inspect-ai" \
    "    evalplus         — HumanEval+ / MBPP+ with native ollama support" \
    "    bigcodebench     — 1,140 coding tasks across 139 libraries" \
    "" \
    "  Activate venv:  source .venv/bin/activate" \
    "" \
    "  Run evaluations:" \
    "    make agentic-quick      # EvalPlus HumanEval+ + IFEval (~1 hour)" \
    "    make agentic-code       # EvalPlus + BigCodeBench (~2-3 hours)" \
    "    make agentic-tooluse    # BFCL function calling (~1-2 hours)" \
    "    make agentic-full       # All of the above (~5-6 hours)" \
    ""