Models: - configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding) - Updated benchmark setup to show catalog with download status - docs/model-recommendations.md: memory planning, quantization guide Agentic evaluation: - scripts/agentic/setup.sh: installs inspect-ai, evalplus, bigcodebench in a Python venv - scripts/agentic/run-eval.sh: runs evaluations against local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+IFEval), code (EvalPlus+BigCodeBench), tooluse (BFCL), full (all) - bin/agentic: dispatcher with help - docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
99 lines
3.5 KiB
Bash
#!/usr/bin/env bash
#
# Setup agentic evaluation tools: creates a Python virtual environment and
# installs the eval frameworks used by scripts/agentic/run-eval.sh.
#
# Requires: python3, lib/common.sh (log_*, is_cmd, data_dir helpers).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────

# Fail early with a clear message instead of a bare "python3: command not
# found" halfway through setup.
is_cmd python3 || { log_warn "python3 not found — install Python 3 first"; exit 1; }

VENV_DIR="$(data_dir venv)"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  log_info "Creating Python virtual environment..."
  python3 -m venv "$VENV_DIR"
  log_success "Virtual environment created at $VENV_DIR"
fi

# shellcheck disable=SC1091 — activate script is generated at runtime
source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"
|
|
|
|
# ── Install evaluation frameworks ────────────────────────

#######################################
# Install a pip package into the active venv unless its module already
# imports. Shows only the tail of pip's output to keep the log readable.
# Globals:   none (relies on the venv activated above)
# Arguments: $1 - importable module name (import-check probe)
#            $2 - pip package name
#            $3 - optional short description for the install message
# Outputs:   log lines; last 3 lines of pip output on install
#######################################
ensure_pip_pkg() {
  local module=$1 pkg=$2 desc=${3:-}
  if python3 -c "import ${module}" 2>/dev/null; then
    log_success "$pkg already installed"
  else
    log_info "Installing ${pkg}${desc:+ ($desc)}..."
    # python3 -m pip guarantees the venv's pip is used, not a stray
    # `pip` earlier on PATH.
    python3 -m pip install "$pkg" 2>&1 | tail -3
    log_success "$pkg installed"
  fi
}

# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
ensure_pip_pkg inspect_ai inspect-ai "main eval framework"

# EvalPlus — HumanEval+ and MBPP+ with native ollama support
ensure_pip_pkg evalplus evalplus "code generation benchmarks"

# BigCodeBench
ensure_pip_pkg bigcodebench bigcodebench
|
|
|
|
# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"

ollama_ok=false
llamacpp_ok=false

if is_cmd ollama; then
  # --max-time keeps the probe from hanging forever on a wedged server.
  if curl -s --max-time 5 http://localhost:11434/api/tags >/dev/null 2>&1; then
    log_success "ollama running at localhost:11434"
    ollama_ok=true
    # List available models
    log_info "Available ollama models:"
    ollama list 2>/dev/null | head -10 || true
  else
    log_warn "ollama installed but not running. Start with: ollama serve"
  fi
else
  log_info "ollama not installed — needed for most agentic benchmarks"
  log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
fi

# Check for llama.cpp server
if curl -s --max-time 5 http://localhost:8080/health >/dev/null 2>&1; then
  log_success "llama.cpp server running at localhost:8080"
  llamacpp_ok=true
else
  log_info "No llama.cpp server detected at localhost:8080"
  log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi

if ! $ollama_ok && ! $llamacpp_ok; then
  log_warn "No local LLM server running. Agentic benchmarks need one."
fi
|
|
|
|
# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"

# One heredoc instead of 18 echo calls; the activation hint uses the real
# $VENV_DIR rather than a hard-coded "data/venv", which can drift from
# whatever data_dir() resolves to.
cat <<EOF

  Installed tools:
    inspect-ai   — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)
    evalplus     — HumanEval+ / MBPP+ with native ollama support
    bigcodebench — 1,140 coding tasks across 139 libraries

  To activate the virtual environment:
    source $VENV_DIR/bin/activate

  Run evaluations:
    make agentic-quick   # EvalPlus + IFEval (~1 hour)
    make agentic-full    # BFCL + BigCodeBench (~3-4 hours)

EOF