#!/usr/bin/env bash
# Setup agentic evaluation tools
#
# Steps performed, in order:
#   1. Create the project virtual environment if it does not exist yet.
#   2. Activate it and install the packages listed in requirements.txt.
#   3. Probe the two local LLM server endpoints and report which one answers.
#   4. Print a summary of the installed tools and the make targets to run.
#
# Requires ../../lib/common.sh (relative to this script), which must provide
# log_header, log_info, log_warn and log_success.
# NOTE(review): PROJECT_ROOT is presumably defined by common.sh — confirm.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

# Refuse to continue with an unset or empty PROJECT_ROOT: an empty value would
# otherwise place the virtual environment at "/.venv".
: "${PROJECT_ROOT:?PROJECT_ROOT must be set (expected from lib/common.sh)}"

# Print an error message to stderr and terminate the script with status 1.
abort_setup() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Return 0 when the URL in $1 answers with a successful HTTP status.
# The timeouts keep a half-open or stalled port from hanging the setup.
server_responds() {
  curl -sf --connect-timeout 2 --max-time 5 "$1" >/dev/null 2>&1
}

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────
VENV_DIR="$PROJECT_ROOT/.venv"
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"

if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  # Prefer Python 3.13 (bigcodebench requires <3.14)
  PYTHON_BIN="python3.13"
  if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
    PYTHON_BIN="python3"
    # Without this check a missing interpreter only shows up later as a
    # bare "command not found" from the venv creation step.
    command -v "$PYTHON_BIN" >/dev/null 2>&1 \
      || abort_setup "neither python3.13 nor python3 was found on PATH"
    log_warn "python3.13 not found, using $("$PYTHON_BIN" --version). bigcodebench may not install."
  fi
  log_info "Creating virtual environment with $("$PYTHON_BIN" --version)..."
  "$PYTHON_BIN" -m venv "$VENV_DIR"
  log_success "Virtual environment created at $VENV_DIR"
fi

source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"

# ── Install from requirements.txt ────────────────────────
[[ -f "$REQUIREMENTS" ]] \
  || abort_setup "requirements file not found: $REQUIREMENTS"

log_info "Installing dependencies from requirements.txt..."
# "python3 -m pip" guarantees the pip of the interpreter activated above.
# Only the last lines of pip's output are shown; pipefail makes the pipeline
# report pip's failure even though tail itself succeeds.
if ! python3 -m pip install -r "$REQUIREMENTS" 2>&1 | tail -n 5; then
  abort_setup "pip install failed (last lines of its output are shown above)"
fi
log_success "Dependencies installed"

# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"
if ! command -v curl >/dev/null 2>&1; then
  # Without curl the probes below would fail and wrongly report "no server".
  log_warn "curl not found; skipping local LLM server check."
elif server_responds "http://localhost:8080/health"; then
  log_success "llama-server running at localhost:8080"
elif server_responds "http://localhost:11434/api/tags"; then
  log_success "ollama running at localhost:11434"
else
  # The warning text contains an embedded line break; it is kept unchanged.
  log_warn "No local LLM server running. 
Start one before running evals:"
  log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
  log_info " ollama serve (ollama)"
fi

# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"
# Quoted delimiter: the text is printed literally, with no expansion.
cat <<'EOF'

 Installed tools:
 inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)
 inspect-evals — Task definitions for inspect-ai
 evalplus — HumanEval+ / MBPP+ with native ollama support
 bigcodebench — 1,140 coding tasks across 139 libraries

 Activate venv: source .venv/bin/activate

 Run evaluations:
 make agentic-quick # EvalPlus HumanEval+ + IFEval (~1 hour)
 make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)
 make agentic-tooluse # BFCL function calling (~1-2 hours)
 make agentic-full # All of the above (~5-6 hours)

EOF