feat: add Qwen3.5 model catalog and agentic evaluation framework

Models: - configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding) - Updated benchmark setup to show catalog with download status - docs/model-recommendations.md: memory planning, quantization guide Agentic evaluation: - scripts/agentic/setup.sh: installs inspect-ai, evalplus, bigcodebench in a Python venv - scripts/agentic/run-eval.sh: runs evaluations against local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+IFEval), code (EvalPlus+BigCodeBench), tooluse (BFCL), full (all) - bin/agentic: dispatcher with help - docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 00:20:23 +01:00
parent 71053997be
commit 58124cd657
11 changed files with 1354 additions and 16 deletions
--- a/scripts/agentic/run-eval.sh
+++ b/scripts/agentic/run-eval.sh
@@ -0,0 +1,180 @@
+#!/usr/bin/env bash
+# Run agentic evaluations (EvalPlus, Inspect AI, BigCodeBench) against a local LLM server
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/../../lib/common.sh"
+
+VENV_DIR="$(data_dir venv)"
+EVAL_DIR="$(data_dir evals)"
+
+# ── Argument parsing ─────────────────────────────────────
+SUITE="quick"
+MODEL=""
+ENDPOINT="http://localhost:11434/v1"  # ollama default OpenAI-compat endpoint
+PROVIDER="openai"
+# Value-taking flags use ${2:?...}: a missing value gets a clear error instead of "unbound variable"
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --suite|-s)    SUITE="${2:?$1 requires a value}"; shift 2 ;;
+        --model|-m)    MODEL="${2:?$1 requires a value}"; shift 2 ;;
+        --endpoint|-e) ENDPOINT="${2:?$1 requires a value}"; shift 2 ;;
+        --help|-h)
+            echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
+            echo ""
+            echo "Suites:"
+            echo "  quick    EvalPlus HumanEval+ + IFEval (~1 hour)"
+            echo "  code     EvalPlus + BigCodeBench (~2-3 hours)"
+            echo "  tooluse  BFCL function calling (~1-2 hours)"
+            echo "  full     All of the above (~5-6 hours)"
+            echo ""
+            echo "Options:"
+            echo "  --model    Model name as known by the server (e.g., qwen3.5:35b-a3b)"
+            echo "  --endpoint OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
+            exit 0 ;;
+        *) log_warn "Unknown argument: $1"; shift ;;
+    esac
+done
+
+# ── Validation ───────────────────────────────────────────
+if [[ -z "$MODEL" ]]; then
+    log_error "Model name required. Use --model NAME"
+    log_info "Examples:"
+    log_info "  --model qwen3.5:35b-a3b-q8_0       (ollama)"
+    log_info "  --model Qwen3.5-35B-A3B-Q8_0        (llama.cpp server)"
+    exit 1
+fi
+
+if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
+    log_error "Virtual environment not found. Run: make agentic-setup"
+    exit 1
+fi
+source "$VENV_DIR/bin/activate"
+
+# Check server is reachable; if not, fall back to a local ollama (probed via its native API)
+if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
+    if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
+        ENDPOINT="http://localhost:11434/v1"  # really switch, so the evals do not target the dead endpoint
+        log_info "Ollama detected, using OpenAI-compat endpoint"
+    else
+        log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
+        exit 1
+    fi
+fi
+
+TS="$(timestamp)"
+RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"  # ':' and '/' in the model name become '_'
+mkdir -p "$RUN_DIR"
+
+log_header "Agentic Evaluation: $SUITE"
+log_info "Model:    $MODEL"
+log_info "Endpoint: $ENDPOINT"
+log_info "Results:  $RUN_DIR"
+
+# Save run metadata
+cat > "$RUN_DIR/metadata.json" << ENDJSON  # unquoted delimiter: values expand verbatim, not JSON-escaped
+{
+    "suite": "$SUITE",
+    "model": "$MODEL",
+    "endpoint": "$ENDPOINT",
+    "timestamp": "$TS",
+    "hostname": "$(hostname)"
+}
+ENDJSON
+
+# ── Start metric logging ────────────────────────────────
+METRICS_FILE="$RUN_DIR/metrics.csv"
+bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &  # runs in the background
+METRICS_PID=$!
+trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT  # stop and reap the logger on any exit
+
+# ── Suite execution ──────────────────────────────────────
+
+run_evalplus() {
+    # Run one EvalPlus benchmark against $ENDPOINT; console output is teed into $RUN_DIR.
+    local bench="$1"  # humaneval or mbpp
+    log_info "Running EvalPlus $bench..."
+
+    OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
+        --model "$MODEL" \
+        --backend openai \
+        --dataset "$bench" \
+        --greedy \
+        2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"
+
+    # Best-effort copy of results if generated. NOTE(review): confirm evalplus really writes to this path.
+    local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
+    if [[ -d "$result_dir" ]]; then
+        cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
+    fi
+    log_success "EvalPlus $bench complete"
+}
+
+run_inspect_eval() {
+    # Run Inspect AI task $1 against $ENDPOINT; $2 is only a display name. NOTE(review): confirm $1 resolves.
+    local eval_name="$1"
+    local display_name="$2"
+    log_info "Running Inspect AI: $display_name..."
+
+    OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
+    inspect eval "$eval_name" \
+        --model "openai/$MODEL" \
+        --log-dir "$RUN_DIR/inspect-logs/" \
+        2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
+
+    log_success "Inspect $display_name complete"
+}
+
+run_bigcodebench() {
+    # Run BigCodeBench against $ENDPOINT: "complete" is a --split value; --subset selects full|hard.
+    log_info "Running BigCodeBench..."
+    OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
+        --model "$MODEL" --backend openai \
+        --split complete --subset full \
+        2>&1 | tee "$RUN_DIR/bigcodebench.log"
+    log_success "BigCodeBench complete"
+}
+
+case "$SUITE" in
+    quick)
+        run_evalplus "humaneval"  # benchmarks run one after another in the order listed
+        run_inspect_eval "ifeval" "IFEval (instruction following)"
+        ;;
+    code)
+        run_evalplus "humaneval"
+        run_evalplus "mbpp"
+        run_bigcodebench
+        ;;
+    tooluse)
+        run_inspect_eval "bfcl" "BFCL (function calling)"
+        ;;
+    full)
+        run_evalplus "humaneval"  # under set -e/pipefail a failing benchmark stops the remaining ones
+        run_evalplus "mbpp"
+        run_inspect_eval "ifeval" "IFEval (instruction following)"
+        run_inspect_eval "bfcl" "BFCL (function calling)"
+        run_bigcodebench
+        ;;
+    *)
+        log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
+        exit 1  # reached only after the run directory and metrics logger were already set up
+        ;;
+esac
+
+# ── Summary ──────────────────────────────────────────────
+log_header "Evaluation Complete"
+log_info "Results saved to: $RUN_DIR"
+log_info "Contents:"
+ls -1 "$RUN_DIR" | sed 's/^/  /'
+
+# Parse and display results summary
+log_header "Results Summary"
+for logfile in "$RUN_DIR"/*.log; do
+    [[ -f "$logfile" ]] || continue
+    local_name="$(basename "$logfile" .log)"
+    echo ""
+    echo "  --- $local_name ---"
+    # Best-effort pass-rate extraction; "|| true" keeps a no-match grep from aborting under pipefail + set -e
+    { grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" || true; } | tail -5 | sed 's/^/  /'
+done
+echo ""
--- a/scripts/agentic/setup.sh
+++ b/scripts/agentic/setup.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Setup agentic evaluation tools
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/../../lib/common.sh"
+
+log_header "Agentic Evaluation Setup"
+
+# ── Python virtual environment ───────────────────────────
+VENV_DIR="$(data_dir venv)"
+if [[ ! -f "$VENV_DIR/bin/activate" ]]; then  # create the venv only when no activate script exists yet
+    log_info "Creating Python virtual environment..."
+    python3 -m venv "$VENV_DIR"
+    log_success "Virtual environment created at $VENV_DIR"
+fi
+
+source "$VENV_DIR/bin/activate"  # the python3/pip calls below run inside the venv
+log_info "Python: $(python3 --version) from $VENV_DIR"
+
+# ── Install evaluation frameworks ────────────────────────
+
+# Install a Python package into the active venv unless its module already imports.
+#   $1  module name used for the import check (e.g. inspect_ai)
+#   $2  pip package name, also used in the log messages (e.g. inspect-ai)
+#   $3  optional text appended to the "Installing ..." message
+# Only the last 3 lines of pip output are shown; under `set -o pipefail` a failing
+# pip still aborts the script.
+ensure_python_package() {
+    local module="$1" package="$2" note="${3:-}"
+    if python3 -c "import $module" 2>/dev/null; then
+        log_success "$package already installed"
+        return 0
+    fi
+    log_info "Installing ${package}${note}..."
+    pip install "$package" 2>&1 | tail -3
+    log_success "$package installed"
+}
+
+# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
+# NOTE(review): the task definitions may ship in the separate inspect_evals package — confirm.
+ensure_python_package inspect_ai inspect-ai " (main eval framework)"
+
+# EvalPlus — HumanEval+ and MBPP+ with native ollama support
+ensure_python_package evalplus evalplus " (code generation benchmarks)"
+
+# BigCodeBench
+ensure_python_package bigcodebench bigcodebench
+
+# ── Check for local LLM server ──────────────────────────
+log_header "LLM Server Check"
+
+ollama_ok=false
+llamacpp_ok=false
+
+if is_cmd ollama; then
+    if curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then  # -f: an HTTP error status counts as "not running"
+        log_success "ollama running at localhost:11434"
+        ollama_ok=true
+        # List available models
+        log_info "Available ollama models:"
+        ollama list 2>/dev/null | head -10 || true
+    else
+        log_warn "ollama installed but not running. Start with: ollama serve"
+    fi
+else
+    log_info "ollama not installed — needed for most agentic benchmarks"
+    log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
+fi
+
+# Check for llama.cpp server (-f so an error reply from some other service on :8080 is not mistaken for it)
+if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
+    log_success "llama.cpp server running at localhost:8080"
+    llamacpp_ok=true
+else
+    log_info "No llama.cpp server detected at localhost:8080"
+    log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
+fi
+
+if ! $ollama_ok && ! $llamacpp_ok; then
+    log_warn "No local LLM server running. Agentic benchmarks need one."
+fi
+
+# ── Summary ──────────────────────────────────────────────
+log_header "Setup Complete"
+echo ""
+echo "  Installed tools:"
+echo "    inspect-ai     — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
+echo "    evalplus        — HumanEval+ / MBPP+ with native ollama support"
+echo "    bigcodebench    — 1,140 coding tasks across 139 libraries"
+echo ""
+echo "  To activate the virtual environment:"
+echo "    source $VENV_DIR/bin/activate"  # print the venv path actually used, not a repo-relative guess
+echo ""
+echo "  Run evaluations:"
+echo "    make agentic-quick      # EvalPlus + IFEval (~1 hour)"
+echo "    make agentic-full       # All suites (~5-6 hours)"
+echo ""