Changelog:
- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)

(211 lines, 7.0 KiB, Bash)
#!/usr/bin/env bash
#
# Run agentic evaluations against a local LLM server.
#
# Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]

set -euo pipefail

# Absolute directory of this script, so relative paths work from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Provides the helpers used below: log_info/log_warn/log_error/log_success,
# log_header, data_dir, timestamp, and PROJECT_ROOT.
# NOTE(review): inferred from usage in this file — confirm against lib/common.sh.
# shellcheck source=/dev/null
source "$SCRIPT_DIR/../../lib/common.sh"

VENV_DIR="$PROJECT_ROOT/.venv"
EVAL_DIR="$(data_dir evals)"
# ── Argument parsing ─────────────────────────────────────
SUITE="quick"
MODEL=""                                # auto-detected from the server when empty
ENDPOINT="http://localhost:11434/v1"    # ollama default OpenAI-compat endpoint
PROVIDER="openai"

while [[ $# -gt 0 ]]; do
  case "$1" in
    # ${2:?...} gives a clear diagnostic when a flag is passed without its
    # value, instead of set -u's opaque "$2: unbound variable".
    --suite|-s) SUITE="${2:?--suite requires a value}"; shift 2 ;;
    --model|-m) MODEL="${2:?--model requires a value}"; shift 2 ;;
    --endpoint|-e) ENDPOINT="${2:?--endpoint requires a value}"; shift 2 ;;
    --help|-h)
      echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
      echo ""
      echo "Suites:"
      echo " quick EvalPlus HumanEval+ + IFEval (~1 hour)"
      echo " code EvalPlus + BigCodeBench (~2-3 hours)"
      echo " tooluse BFCL function calling (~1-2 hours)"
      echo " full All of the above (~5-6 hours)"
      echo ""
      echo "Options:"
      echo " --model Model name as known by the server (e.g., qwen3.5:35b-a3b)"
      echo " --endpoint OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
      exit 0 ;;
    # Unknown args are warned about and skipped, not fatal (deliberate).
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done
# ── Validation ───────────────────────────────────────────
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  log_error "Virtual environment not found. Run: make agentic-setup"
  exit 1
fi
# shellcheck disable=SC1091 — venv path is computed at runtime
source "$VENV_DIR/bin/activate"

# Auto-detect server if no explicit endpoint given.
# Prefers llama-server (8080) over ollama (11434) when both are running.
if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
  if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
    ENDPOINT="http://localhost:8080/v1"
    log_info "Auto-detected llama-server at localhost:8080"
  elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
    log_info "Auto-detected ollama at localhost:11434"
  else
    log_error "No LLM server found. Start one first:"
    log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
    log_info " ollama serve (ollama)"
    exit 1
  fi
else
  # Explicit endpoint: accept either a llama-server /health probe or a
  # generic OpenAI-compatible /models listing.
  if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
     ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
    log_error "No LLM server at $ENDPOINT"
    exit 1
  fi
fi
# Auto-detect model name from server if not provided: query the
# OpenAI-compatible /models endpoint and take the first model id.
if [[ -z "$MODEL" ]]; then
  DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    models = data.get('data', [])
    if models:
        print(models[0].get('id', ''))
except Exception:
    # Best-effort: any parse failure just yields empty output.
    # (Was a bare 'except:', which also swallowed KeyboardInterrupt/SystemExit.)
    pass
" 2>/dev/null || true)
  if [[ -n "$DETECTED_MODEL" ]]; then
    MODEL="$DETECTED_MODEL"
    log_info "Auto-detected model: $MODEL"
  else
    log_error "Model name required. Use --model NAME"
    log_info "Examples:"
    log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
    log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
    exit 1
  fi
fi
TS="$(timestamp)"

#######################################
# Make a model name filesystem-safe: every character outside [a-zA-Z0-9._-]
# becomes '_', with runs squeezed to a single '_'.
# BUG FIX: uses printf instead of echo — echo's trailing newline is in tr's
# complement set, so the old pipeline appended a stray '_' to every name.
# Arguments: $1 - raw model name
# Outputs:   sanitized name on stdout (no trailing newline)
#######################################
sanitize_model_name() {
  printf '%s' "$1" | tr -cs 'a-zA-Z0-9._-' '_'
}

SAFE_MODEL="$(sanitize_model_name "$MODEL")"
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
mkdir -p "$RUN_DIR"
log_header "Agentic Evaluation: $SUITE"
log_info "Model: $MODEL"
log_info "Endpoint: $ENDPOINT"
log_info "Results: $RUN_DIR"

# Save run metadata so results can be traced back to their configuration.
# NOTE(review): values are interpolated into the JSON verbatim — a model name
# or endpoint containing '"' or '\' would produce invalid JSON. Confirm
# whether inputs can contain such characters; if so, build this with
# python3 -c 'json.dumps(...)' instead.
cat > "$RUN_DIR/metadata.json" << ENDJSON
{
"suite": "$SUITE",
"model": "$MODEL",
"endpoint": "$ENDPOINT",
"timestamp": "$TS",
"hostname": "$(hostname)"
}
ENDJSON
# ── Start metric logging ────────────────────────────────
METRICS_FILE="$RUN_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!

# Stop the background metrics logger on any exit path.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
# BUG FIX: the trap previously ran 'cleanup; exit 0', which forced the
# script's exit status to 0 even when an eval failed under `set -e`,
# hiding failures from callers (make/CI). A plain cleanup trap preserves
# the original exit status.
trap cleanup EXIT
# ── Suite execution ──────────────────────────────────────

#######################################
# Run one EvalPlus benchmark (greedy decoding) against the endpoint.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Arguments: $1 - dataset name: "humaneval" or "mbpp"
# Outputs:   tees evalplus output to $RUN_DIR/evalplus-<bench>.log and copies
#            any cached results evalplus left under ~/.evalplus
#######################################
run_evalplus() {
  local bench="$1" # humaneval or mbpp
  log_info "Running EvalPlus $bench..."

  OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
    --model "$MODEL" \
    --backend openai \
    --dataset "$bench" \
    --greedy \
    2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"

  # Copy results if generated.
  # NOTE(review): result cache layout assumed to be ~/.evalplus/<model>/<bench>
  # — confirm against the installed evalplus version.
  local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
  if [[ -d "$result_dir" ]]; then
    cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
  fi
  log_success "EvalPlus $bench complete"
}
#######################################
# Run one Inspect AI evaluation against the endpoint.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Arguments: $1 - inspect task id (e.g. inspect_evals/ifeval)
#            $2 - human-readable name used in log lines
# Outputs:   inspect logs under $RUN_DIR/inspect-logs/ and a tee'd console
#            log at $RUN_DIR/inspect-<safe_name>.log
#######################################
run_inspect_eval() {
  local eval_name="$1"
  local display_name="$2"
  # Slashes in the task id would create subdirectories in the log filename.
  local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval
  log_info "Running Inspect AI: $display_name..."

  # The local server ignores the API key, but the openai client requires one.
  OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
    inspect eval "$eval_name" \
    --model "openai/$MODEL" \
    --log-dir "$RUN_DIR/inspect-logs/" \
    2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"

  log_success "Inspect $display_name complete"
}
#######################################
# Run BigCodeBench (complete subset) against the endpoint.
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Outputs:   tees output to $RUN_DIR/bigcodebench.log
#######################################
run_bigcodebench() {
  log_info "Running BigCodeBench..."
  OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
    --model "$MODEL" \
    --backend openai \
    --subset complete \
    2>&1 | tee "$RUN_DIR/bigcodebench.log"
  log_success "BigCodeBench complete"
}
# Dispatch the selected suite to the runners defined above. Unknown suite
# names (including bad --suite values accepted during parsing) fail here.
case "$SUITE" in
  quick)
    run_evalplus "humaneval"
    run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
    ;;
  code)
    run_evalplus "humaneval"
    run_evalplus "mbpp"
    run_bigcodebench
    ;;
  tooluse)
    run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
    ;;
  full)
    run_evalplus "humaneval"
    run_evalplus "mbpp"
    run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
    run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
    run_bigcodebench
    ;;
  *)
    log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
    exit 1
    ;;
esac
# ── Summary ──────────────────────────────────────────────
log_header "Evaluation Complete"
log_info "Results saved to: $RUN_DIR"
log_info "Contents:"
ls -1 "$RUN_DIR" | sed 's/^/ /'

# Parse and display results summary
log_header "Results Summary"
for logfile in "$RUN_DIR"/*.log; do
  [[ -f "$logfile" ]] || continue   # handles the no-match glob case
  log_name="$(basename "$logfile" .log)"
  echo ""
  echo " --- $log_name ---"
  # Try to extract pass rates from common output formats.
  # BUG FIX: '|| true' is required — grep exits 1 when nothing matches, and
  # under `set -e -o pipefail` that aborted the script mid-summary for any
  # log file without a recognizable score line.
  grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" | tail -5 | sed 's/^/ /' || true
done
echo ""