feat: add Qwen3.5 model catalog and agentic evaluation framework

Models: - configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding) - Updated benchmark setup to show catalog with download status - docs/model-recommendations.md: memory planning, quantization guide Agentic evaluation: - scripts/agentic/setup.sh: installs inspect-ai, evalplus, bigcodebench in a Python venv - scripts/agentic/run-eval.sh: runs evaluations against local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+IFEval), code (EvalPlus+BigCodeBench), tooluse (BFCL), full (all) - bin/agentic: dispatcher with help - docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 00:20:23 +01:00
parent 71053997be
commit 58124cd657
11 changed files with 1354 additions and 16 deletions
--- a/scripts/agentic/run-eval.sh
+++ b/scripts/agentic/run-eval.sh
@@ -0,0 +1,180 @@
+#!/usr/bin/env bash
+# Run agentic evaluations against a local LLM server
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/../../lib/common.sh"
+
+VENV_DIR="$(data_dir venv)"
+EVAL_DIR="$(data_dir evals)"
+
+# ── Argument parsing ─────────────────────────────────────
+# Defaults; each can be overridden from the command line below.
+SUITE="quick"
+MODEL=""
+ENDPOINT="http://localhost:11434/v1"  # ollama default OpenAI-compat endpoint
+PROVIDER="openai"  # NOTE(review): not read anywhere in the visible script
+
+# Walk the command line. Every value-taking flag consumes two words, so the
+# presence of the value is checked first: under `set -u` a bare trailing flag
+# would otherwise abort with an "unbound variable" error on $2.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --suite|-s|--model|-m|--endpoint|-e)
+            if [[ $# -lt 2 ]]; then
+                log_error "Option $1 requires a value"
+                exit 1
+            fi
+            case "$1" in
+                --suite|-s)    SUITE="$2" ;;
+                --model|-m)    MODEL="$2" ;;
+                --endpoint|-e) ENDPOINT="$2" ;;
+            esac
+            shift 2 ;;
+        --help|-h)
+            echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
+            echo ""
+            echo "Suites:"
+            echo "  quick    EvalPlus HumanEval+ + IFEval (~1 hour)"
+            echo "  code     EvalPlus + BigCodeBench (~2-3 hours)"
+            echo "  tooluse  BFCL function calling (~1-2 hours)"
+            echo "  full     All of the above (~5-6 hours)"
+            echo ""
+            echo "Options:"
+            echo "  --model    Model name as known by the server (e.g., qwen3.5:35b-a3b)"
+            echo "  --endpoint OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
+            exit 0 ;;
+        # Unknown arguments are reported but are not fatal.
+        *) log_warn "Unknown argument: $1"; shift ;;
+    esac
+done
+
+# ── Validation ───────────────────────────────────────────
+# A model name is mandatory; there is no sensible default.
+if [[ -z "$MODEL" ]]; then
+    log_error "Model name required. Use --model NAME"
+    log_info "Examples:"
+    log_info "  --model qwen3.5:35b-a3b-q8_0       (ollama)"
+    log_info "  --model Qwen3.5-35B-A3B-Q8_0        (llama.cpp server)"
+    exit 1
+fi
+
+# Reject an unknown suite up front, before a run directory is created or the
+# metrics logger is started for a run that cannot execute anything.
+case "$SUITE" in
+    quick|code|tooluse|full) ;;
+    *)
+        log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
+        exit 1
+        ;;
+esac
+
+# The evaluation tools live in a dedicated virtual environment.
+if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
+    log_error "Virtual environment not found. Run: make agentic-setup"
+    exit 1
+fi
+source "$VENV_DIR/bin/activate"
+
+# Check server is reachable
+if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
+    # Fall back to Ollama's native API as proof of life, but only when
+    # ENDPOINT really points at the local Ollama instance. Otherwise an
+    # unreachable custom endpoint would pass this check merely because an
+    # unrelated Ollama happens to be running on this machine.
+    if [[ "$ENDPOINT" == http://localhost:11434* || "$ENDPOINT" == http://127.0.0.1:11434* ]] \
+        && curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
+        log_info "Ollama detected, using OpenAI-compat endpoint"
+    else
+        log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
+        exit 1
+    fi
+fi
+
+TS="$(timestamp)"
+RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
+mkdir -p "$RUN_DIR"
+
+log_header "Agentic Evaluation: $SUITE"
+log_info "Model:    $MODEL"
+log_info "Endpoint: $ENDPOINT"
+log_info "Results:  $RUN_DIR"
+
+# Save run metadata
+cat > "$RUN_DIR/metadata.json" << ENDJSON
+{
+    "suite": "$SUITE",
+    "model": "$MODEL",
+    "endpoint": "$ENDPOINT",
+    "timestamp": "$TS",
+    "hostname": "$(hostname)"
+}
+ENDJSON
+
+# ── Start metric logging ────────────────────────────────
+METRICS_FILE="$RUN_DIR/metrics.csv"
+bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
+METRICS_PID=$!
+trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
+
+# ── Suite execution ──────────────────────────────────────
+
+run_evalplus() {
+    local bench="$1"  # humaneval or mbpp
+    log_info "Running EvalPlus $bench..."
+    local out="$RUN_DIR/evalplus-${bench}.json"
+
+    OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
+        --model "$MODEL" \
+        --backend openai \
+        --dataset "$bench" \
+        --greedy \
+        2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"
+
+    # Copy results if generated
+    local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
+    if [[ -d "$result_dir" ]]; then
+        cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
+    fi
+    log_success "EvalPlus $bench complete"
+}
+
+run_inspect_eval() {
+    local eval_name="$1"
+    local display_name="$2"
+    log_info "Running Inspect AI: $display_name..."
+    local out="$RUN_DIR/inspect-${eval_name}.json"
+
+    OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
+    inspect eval "$eval_name" \
+        --model "openai/$MODEL" \
+        --log-dir "$RUN_DIR/inspect-logs/" \
+        2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
+
+    log_success "Inspect $display_name complete"
+}
+
+run_bigcodebench() {
+    log_info "Running BigCodeBench..."
+    OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
+        --model "$MODEL" \
+        --backend openai \
+        --subset complete \
+        2>&1 | tee "$RUN_DIR/bigcodebench.log"
+    log_success "BigCodeBench complete"
+}
+
+case "$SUITE" in
+    quick)
+        run_evalplus "humaneval"
+        run_inspect_eval "ifeval" "IFEval (instruction following)"
+        ;;
+    code)
+        run_evalplus "humaneval"
+        run_evalplus "mbpp"
+        run_bigcodebench
+        ;;
+    tooluse)
+        run_inspect_eval "bfcl" "BFCL (function calling)"
+        ;;
+    full)
+        run_evalplus "humaneval"
+        run_evalplus "mbpp"
+        run_inspect_eval "ifeval" "IFEval (instruction following)"
+        run_inspect_eval "bfcl" "BFCL (function calling)"
+        run_bigcodebench
+        ;;
+    *)
+        log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
+        exit 1
+        ;;
+esac
+
+# ── Summary ──────────────────────────────────────────────
+log_header "Evaluation Complete"
+log_info "Results saved to: $RUN_DIR"
+log_info "Contents:"
+# List the run directory with a glob instead of parsing `ls` output.
+for entry in "$RUN_DIR"/*; do
+    [[ -e "$entry" ]] || continue
+    printf '  %s\n' "${entry##*/}"
+done
+
+# Parse and display results summary
+log_header "Results Summary"
+for logfile in "$RUN_DIR"/*.log; do
+    [[ -f "$logfile" ]] || continue
+    summary_name="$(basename "$logfile" .log)"
+    echo ""
+    echo "  --- $summary_name ---"
+    # Try to extract pass rates from common output formats.
+    # grep exits 1 when nothing matches; with `set -e` and `pipefail` that
+    # would end the script in the middle of the summary, so a log without any
+    # matching line is treated as "nothing to show".
+    { grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" || true; } \
+        | tail -5 | sed 's/^/  /'
+done
+echo ""