#!/usr/bin/env bash
# Setup agentic evaluation tools
#
# Creates/activates a Python venv, installs the eval frameworks
# (inspect-ai, evalplus, bigcodebench), and checks for a local
# LLM server (ollama or llama.cpp).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Provides log_header/log_info/log_success/log_warn, is_cmd, data_dir.
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────
VENV_DIR="$(data_dir venv)"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  log_info "Creating Python virtual environment..."
  python3 -m venv "$VENV_DIR"
  log_success "Virtual environment created at $VENV_DIR"
fi
source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"

# ── Install evaluation frameworks ────────────────────────

#######################################
# Install a pip package unless its import already resolves.
# Arguments:
#   $1 - Python import name (e.g. inspect_ai)
#   $2 - pip package name   (e.g. inspect-ai)
#   $3 - optional short description for the log message
#######################################
ensure_pip_pkg() {
  local import_name=$1 pkg=$2 desc=${3:-}
  if python3 -c "import ${import_name}" 2>/dev/null; then
    log_success "$pkg already installed"
    return 0
  fi
  if [[ -n "$desc" ]]; then
    log_info "Installing $pkg ($desc)..."
  else
    log_info "Installing $pkg..."
  fi
  # 'python3 -m pip' guarantees the venv's pip is used; with pipefail,
  # an install failure still aborts despite the tail.
  python3 -m pip install "$pkg" 2>&1 | tail -3
  log_success "$pkg installed"
}

# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
ensure_pip_pkg inspect_ai inspect-ai "main eval framework"
# EvalPlus — HumanEval+ and MBPP+ with native ollama support
ensure_pip_pkg evalplus evalplus "code generation benchmarks"
# BigCodeBench
ensure_pip_pkg bigcodebench bigcodebench

# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"

ollama_ok=false
llamacpp_ok=false

if is_cmd ollama; then
  if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
    log_success "ollama running at localhost:11434"
    ollama_ok=true
    # List available models (|| true: SIGPIPE from head is not an error)
    log_info "Available ollama models:"
    ollama list 2>/dev/null | head -10 || true
  else
    log_warn "ollama installed but not running. Start with: ollama serve"
  fi
else
  log_info "ollama not installed — needed for most agentic benchmarks"
  log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
fi

# Check for llama.cpp server
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
  log_success "llama.cpp server running at localhost:8080"
  llamacpp_ok=true
else
  log_info "No llama.cpp server detected at localhost:8080"
  log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi

# String comparison instead of executing the flag variables as commands.
if [[ "$ollama_ok" == false && "$llamacpp_ok" == false ]]; then
  log_warn "No local LLM server running. Agentic benchmarks need one."
fi

# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"
echo ""
echo "  Installed tools:"
echo "    inspect-ai   — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
echo "    evalplus     — HumanEval+ / MBPP+ with native ollama support"
echo "    bigcodebench — 1,140 coding tasks across 139 libraries"
echo ""
echo "  To activate the virtual environment:"
echo "    source data/venv/bin/activate"
echo ""
echo "  Run evaluations:"
echo "    make agentic-quick   # EvalPlus + IFEval (~1 hour)"
echo "    make agentic-full    # BFCL + BigCodeBench (~3-4 hours)"
echo ""