#!/usr/bin/env bash
# Run agentic evaluations against a local LLM server.
#
# Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]
#
# Requires: a virtualenv at $PROJECT_ROOT/.venv (make agentic-setup) and a
# running OpenAI-compatible server (llama-server or ollama).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Provides log_*, data_dir, timestamp, PROJECT_ROOT.
source "$SCRIPT_DIR/../../lib/common.sh"

VENV_DIR="$PROJECT_ROOT/.venv"
EVAL_DIR="$(data_dir evals)"

# ── Argument parsing ─────────────────────────────────────
SUITE="quick"
MODEL=""
ENDPOINT="http://localhost:11434/v1" # ollama default OpenAI-compat endpoint
ENDPOINT_EXPLICIT=0                  # set when the user passes --endpoint; skips auto-detect
# NOTE(review): PROVIDER is never read below — kept for compatibility in case a
# sourced hook expects it; candidate for removal.
PROVIDER="openai"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --suite|-s) SUITE="$2"; shift 2 ;;
    --model|-m) MODEL="$2"; shift 2 ;;
    --endpoint|-e) ENDPOINT="$2"; ENDPOINT_EXPLICIT=1; shift 2 ;;
    --help|-h)
      echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
      echo ""
      echo "Suites:"
      echo " quick EvalPlus HumanEval+ + IFEval (~1 hour)"
      echo " code EvalPlus + BigCodeBench (~2-3 hours)"
      echo " tooluse BFCL function calling (~1-2 hours)"
      echo " full All of the above (~5-6 hours)"
      echo ""
      echo "Options:"
      echo " --model Model name as known by the server (e.g., qwen3.5:35b-a3b)"
      echo " --endpoint OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
      exit 0
      ;;
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done

# ── Validation ───────────────────────────────────────────
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  log_error "Virtual environment not found. Run: make agentic-setup"
  exit 1
fi
source "$VENV_DIR/bin/activate"

# Auto-detect server only when the user did not give an endpoint explicitly.
# (Previously an explicit --endpoint equal to the default could be silently
# overridden by auto-detection of llama-server on :8080.)
if [[ "$ENDPOINT_EXPLICIT" -eq 0 ]]; then
  if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
    ENDPOINT="http://localhost:8080/v1"
    log_info "Auto-detected llama-server at localhost:8080"
  elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
    log_info "Auto-detected ollama at localhost:11434"
  else
    log_error "No LLM server found. Start one first:"
    log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
    log_info " ollama serve (ollama)"
    exit 1
  fi
else
  # Accept either a llama-server-style /health or an OpenAI-style /models reply.
  if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
     ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
    log_error "No LLM server at $ENDPOINT"
    exit 1
  fi
fi

# Auto-detect model name from server if not provided: take the first id from
# the OpenAI-compatible /models listing. Any failure leaves MODEL empty.
if [[ -z "$MODEL" ]]; then
  DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    models = data.get('data', [])
    if models:
        print(models[0].get('id', ''))
except:
    pass
" 2>/dev/null || true)
  if [[ -n "$DETECTED_MODEL" ]]; then
    MODEL="$DETECTED_MODEL"
    log_info "Auto-detected model: $MODEL"
  else
    log_error "Model name required. Use --model NAME"
    log_info "Examples:"
    log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
    log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
    exit 1
  fi
fi

TS="$(timestamp)"
# Filesystem-safe model name: squeeze anything outside [a-zA-Z0-9._-] to '_'.
SAFE_MODEL="$(echo "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
mkdir -p "$RUN_DIR"

log_header "Agentic Evaluation: $SUITE"
log_info "Model: $MODEL"
log_info "Endpoint: $ENDPOINT"
log_info "Results: $RUN_DIR"

# Save run metadata. NOTE(review): values are interpolated unescaped — a model
# name containing '"' would produce invalid JSON; acceptable for local use.
cat > "$RUN_DIR/metadata.json" << ENDJSON
{
  "suite": "$SUITE",
  "model": "$MODEL",
  "endpoint": "$ENDPOINT",
  "timestamp": "$TS",
  "hostname": "$(hostname)"
}
ENDJSON

# ── Start metric logging ────────────────────────────────
METRICS_FILE="$RUN_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!

# Stop the background metrics logger on any exit path.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
# Do NOT 'exit 0' inside the trap: that would mask the script's real exit
# status (a failed eval would report success).
trap cleanup EXIT

# ── Suite execution ──────────────────────────────────────

#######################################
# Run an EvalPlus benchmark against the configured endpoint.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Arguments: $1 - dataset name: humaneval or mbpp
# Outputs:   log + copied results under $RUN_DIR
#######################################
run_evalplus() {
  local bench="$1" # humaneval or mbpp
  log_info "Running EvalPlus $bench..."

  OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
    --model "$MODEL" \
    --backend openai \
    --dataset "$bench" \
    --greedy \
    2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"

  # Copy results if generated (evalplus writes under ~/.evalplus).
  local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
  if [[ -d "$result_dir" ]]; then
    cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
  fi
  log_success "EvalPlus $bench complete"
}

#######################################
# Run an Inspect AI evaluation via the OpenAI-compatible endpoint.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Arguments: $1 - inspect eval id (e.g. inspect_evals/ifeval)
#            $2 - human-readable name for logging
#######################################
run_inspect_eval() {
  local eval_name="$1"
  local display_name="$2"
  local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval
  log_info "Running Inspect AI: $display_name..."

  OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
    inspect eval "$eval_name" \
    --model "openai/$MODEL" \
    --log-dir "$RUN_DIR/inspect-logs/" \
    2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"

  log_success "Inspect $display_name complete"
}

#######################################
# Run BigCodeBench (complete subset) against the configured endpoint.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
#######################################
run_bigcodebench() {
  log_info "Running BigCodeBench..."

  OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
    --model "$MODEL" \
    --backend openai \
    --subset complete \
    2>&1 | tee "$RUN_DIR/bigcodebench.log"

  log_success "BigCodeBench complete"
}

case "$SUITE" in
  quick)
    run_evalplus "humaneval"
    run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
    ;;
  code)
    run_evalplus "humaneval"
    run_evalplus "mbpp"
    run_bigcodebench
    ;;
  tooluse)
    run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
    ;;
  full)
    run_evalplus "humaneval"
    run_evalplus "mbpp"
    run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
    run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
    run_bigcodebench
    ;;
  *)
    log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
    exit 1
    ;;
esac

# ── Summary ──────────────────────────────────────────────
log_header "Evaluation Complete"
log_info "Results saved to: $RUN_DIR"
log_info "Contents:"
ls -1 "$RUN_DIR" | sed 's/^/ /'

# Parse and display results summary.
log_header "Results Summary"
for logfile in "$RUN_DIR"/*.log; do
  [[ -f "$logfile" ]] || continue
  log_name="$(basename "$logfile" .log)"
  echo ""
  echo " --- $log_name ---"
  # Try to extract pass rates from common output formats. '|| true' so a log
  # with no matches (grep exits 1) doesn't abort under set -euo pipefail.
  grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" | tail -5 | sed 's/^/ /' || true
done
echo ""