Files
strix-halo-optimizations/scripts/agentic/run-eval.sh
Felipe Cardoso 58124cd657 feat: add Qwen3.5 model catalog and agentic evaluation framework
Models:
- configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick),
  Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding)
- Updated benchmark setup to show catalog with download status
- docs/model-recommendations.md: memory planning, quantization guide

Agentic evaluation:
- scripts/agentic/setup.sh: installs inspect-ai, evalplus, bigcodebench
  in a Python venv
- scripts/agentic/run-eval.sh: runs evaluations against local LLM server
  (ollama or llama.cpp). Suites: quick (HumanEval+IFEval), code
  (EvalPlus+BigCodeBench), tooluse (BFCL), full (all)
- bin/agentic: dispatcher with help
- docs/agentic-benchmarks.md: methodology, framework comparison, model
  recommendations for agentic use

Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 00:20:23 +01:00

181 lines
5.9 KiB
Bash

#!/usr/bin/env bash
# Run agentic evaluations against a local LLM server
set -euo pipefail
# Resolve this script's directory so sibling/lib paths work from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# common.sh supplies the helpers used below: log_header/log_info/log_warn/
# log_error/log_success, data_dir, timestamp.
# NOTE(review): inferred from call sites in this file — confirm in lib/common.sh.
source "$SCRIPT_DIR/../../lib/common.sh"
VENV_DIR="$(data_dir venv)"
EVAL_DIR="$(data_dir evals)"
# ── Argument parsing ─────────────────────────────────────
# Defaults; overridden by the CLI flags parsed below.
SUITE="quick"
MODEL=""
ENDPOINT="http://localhost:11434/v1" # ollama default OpenAI-compat endpoint
# NOTE(review): PROVIDER is never read anywhere in this script — dead
# variable, or consumed by a sourced helper? Verify before removing.
PROVIDER="openai"
# Parse CLI flags. ${2:?msg} aborts with a readable message when an option's
# value is missing; previously a bare "$2" under `set -u` died with an opaque
# "unbound variable" (or a failed `shift 2`).
while [[ $# -gt 0 ]]; do
  case "$1" in
    --suite|-s) SUITE="${2:?--suite requires a value}"; shift 2 ;;
    --model|-m) MODEL="${2:?--model requires a value}"; shift 2 ;;
    --endpoint|-e) ENDPOINT="${2:?--endpoint requires a value}"; shift 2 ;;
    --help|-h)
      echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
      echo ""
      echo "Suites:"
      echo " quick EvalPlus HumanEval+ + IFEval (~1 hour)"
      echo " code EvalPlus + BigCodeBench (~2-3 hours)"
      echo " tooluse BFCL function calling (~1-2 hours)"
      echo " full All of the above (~5-6 hours)"
      echo ""
      echo "Options:"
      echo " --model Model name as known by the server (e.g., qwen3.5:35b-a3b)"
      echo " --endpoint OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
      exit 0 ;;
    # Unrecognized flags are warned about and skipped, not fatal.
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done
# ── Validation ───────────────────────────────────────────
# A model name is mandatory: there is no sane default across servers.
if [[ -z "$MODEL" ]]; then
log_error "Model name required. Use --model NAME"
log_info "Examples:"
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
exit 1
fi
# The eval harnesses (evalplus, inspect, bigcodebench) live in a venv
# created by the companion setup script.
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
log_error "Virtual environment not found. Run: make agentic-setup"
exit 1
fi
source "$VENV_DIR/bin/activate"
# Check server is reachable
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
# Try ollama native endpoint
# NOTE(review): when the native /api/tags probe succeeds, execution
# continues with the SAME $ENDPOINT whose /models check just failed —
# presumably fine for ollama (it serves /v1/models too), but confirm
# this path actually works rather than failing later in the harnesses.
if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
log_info "Ollama detected, using OpenAI-compat endpoint"
else
log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
exit 1
fi
fi
# Build a unique per-run results directory. ':' and '/' in the model name
# are replaced with '_' so it is filesystem-safe.
TS="$(timestamp)"
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
mkdir -p "$RUN_DIR"
log_header "Agentic Evaluation: $SUITE"
log_info "Model: $MODEL"
log_info "Endpoint: $ENDPOINT"
log_info "Results: $RUN_DIR"
# Save run metadata
# NOTE(review): values are interpolated unescaped into JSON — a model name
# or endpoint containing a double quote would produce invalid JSON. Inputs
# are operator-controlled here, but worth hardening (e.g. via jq) if reused.
cat > "$RUN_DIR/metadata.json" << ENDJSON
{
"suite": "$SUITE",
"model": "$MODEL",
"endpoint": "$ENDPOINT",
"timestamp": "$TS",
"hostname": "$(hostname)"
}
ENDJSON
# ── Start metric logging ────────────────────────────────
# Sample system metrics every 5s in the background for the whole run;
# the EXIT trap reaps the sampler on any exit path (success, error, ^C).
METRICS_FILE="$RUN_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
# ── Suite execution ──────────────────────────────────────
#######################################
# Run one EvalPlus benchmark against the local OpenAI-compatible server.
# Globals:   ENDPOINT, MODEL, RUN_DIR, HOME (read)
# Arguments: $1 - dataset name: "humaneval" or "mbpp"
# Outputs:   tool output teed to $RUN_DIR/evalplus-<bench>.log; results
#            copied into $RUN_DIR/evalplus-<bench>-results/ when present
#######################################
run_evalplus() {
  local bench="$1" # humaneval or mbpp
  log_info "Running EvalPlus $bench..."
  # --greedy: temperature-0 decoding for reproducible pass@1 numbers.
  # (Removed an unused `local out` that was declared but never read.)
  OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
    --model "$MODEL" \
    --backend openai \
    --dataset "$bench" \
    --greedy \
    2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"
  # EvalPlus caches generations/results under ~/.evalplus; keep a copy
  # alongside the logs. NOTE(review): this uses the raw $MODEL (may contain
  # ':' or '/'), unlike the sanitized RUN_DIR name — confirm the layout.
  local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
  if [[ -d "$result_dir" ]]; then
    cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
  fi
  log_success "EvalPlus $bench complete"
}
#######################################
# Run one Inspect AI evaluation against the local server.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Arguments: $1 - inspect eval/task name (e.g. "ifeval")
#            $2 - human-readable name for log messages
# Outputs:   tool output teed to $RUN_DIR/inspect-<eval>.log; Inspect's own
#            logs under $RUN_DIR/inspect-logs/
#######################################
run_inspect_eval() {
  local eval_name="$1"
  local display_name="$2"
  log_info "Running Inspect AI: $display_name..."
  # OPENAI_API_KEY must be set for the openai provider even though the
  # local server ignores it. (Removed an unused `local out` variable.)
  OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
    inspect eval "$eval_name" \
    --model "openai/$MODEL" \
    --log-dir "$RUN_DIR/inspect-logs/" \
    2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
  log_success "Inspect $display_name complete"
}
#######################################
# Evaluate the "complete" subset of BigCodeBench via the local server.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Outputs:   tool output teed to $RUN_DIR/bigcodebench.log
#######################################
run_bigcodebench() {
  log_info "Running BigCodeBench..."
  local logpath="$RUN_DIR/bigcodebench.log"
  OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
    --model "$MODEL" --backend openai --subset complete \
    2>&1 | tee "$logpath"
  log_success "BigCodeBench complete"
}
# Dispatch on the requested suite; each arm is a fixed sequence of the
# runner functions defined above.
case "$SUITE" in
  quick) # fast sanity pass: one coding + one instruction-following eval
    run_evalplus "humaneval"
    run_inspect_eval "ifeval" "IFEval (instruction following)"
    ;;
  code) # coding-focused benchmarks only
    run_evalplus "humaneval"
    run_evalplus "mbpp"
    run_bigcodebench
    ;;
  tooluse) # function-calling only
    run_inspect_eval "bfcl" "BFCL (function calling)"
    ;;
  full) # everything above
    run_evalplus "humaneval"
    run_evalplus "mbpp"
    run_inspect_eval "ifeval" "IFEval (instruction following)"
    run_inspect_eval "bfcl" "BFCL (function calling)"
    run_bigcodebench
    ;;
  *)
    log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
    exit 1
    ;;
esac
# ── Summary ──────────────────────────────────────────────
log_header "Evaluation Complete"
log_info "Results saved to: $RUN_DIR"
log_info "Contents:"
# Glob instead of parsing `ls`; safe for any filename.
for entry in "$RUN_DIR"/*; do
  [[ -e "$entry" ]] || continue
  printf ' %s\n' "$(basename "$entry")"
done
# Parse and display results summary
log_header "Results Summary"
for logfile in "$RUN_DIR"/*.log; do
  [[ -f "$logfile" ]] || continue
  # Renamed from the misleading `local_name` (read like a broken top-level
  # `local` declaration; `local` is only valid inside functions).
  log_name="$(basename "$logfile" .log)"
  echo ""
  echo " --- $log_name ---"
  # Try to extract pass rates from common output formats.
  # `grep` exits 1 when nothing matches; under `set -euo pipefail` that
  # previously aborted the whole script mid-summary (non-zero exit after a
  # successful run). `|| true` makes an empty summary non-fatal.
  grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" | tail -5 | sed 's/^/ /' || true
done
echo ""