feat: add Qwen3.5 model catalog and agentic evaluation framework
Models:
- configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding)
- Updated benchmark setup to show the catalog with download status
- docs/model-recommendations.md: memory planning, quantization guide

Agentic evaluation:
- scripts/agentic/setup.sh: installs inspect-ai, evalplus, and bigcodebench in a Python venv
- scripts/agentic/run-eval.sh: runs evaluations against a local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+ and IFEval), code (EvalPlus + BigCodeBench), tooluse (BFCL), full (all)
- bin/agentic: dispatcher with help
- docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use

Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
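A typical end-to-end run (flag and suite names are those defined in scripts/agentic/run-eval.sh below; the model tag is illustrative):

    # one-time setup: venv + eval frameworks
    scripts/agentic/setup.sh

    # quick suite (~1 hour) against a local ollama server
    scripts/agentic/run-eval.sh --suite quick \
        --model qwen3.5:35b-a3b-q8_0 \
        --endpoint http://localhost:11434/v1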
180  scripts/agentic/run-eval.sh  Normal file
@@ -0,0 +1,180 @@
#!/usr/bin/env bash
# Run agentic evaluations against a local LLM server
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

VENV_DIR="$(data_dir venv)"
EVAL_DIR="$(data_dir evals)"

# ── Argument parsing ─────────────────────────────────────
SUITE="quick"
MODEL=""
ENDPOINT="http://localhost:11434/v1"   # ollama's default OpenAI-compat endpoint
PROVIDER="openai"                      # only the OpenAI-compatible backend is wired up

while [[ $# -gt 0 ]]; do
    case "$1" in
        --suite|-s)    SUITE="$2"; shift 2 ;;
        --model|-m)    MODEL="$2"; shift 2 ;;
        --endpoint|-e) ENDPOINT="$2"; shift 2 ;;
        --help|-h)
            echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
            echo ""
            echo "Suites:"
            echo "  quick     EvalPlus HumanEval+ + IFEval (~1 hour)"
            echo "  code      EvalPlus + BigCodeBench (~2-3 hours)"
            echo "  tooluse   BFCL function calling (~1-2 hours)"
            echo "  full      All of the above (~5-6 hours)"
            echo ""
            echo "Options:"
            echo "  --model     Model name as known by the server (e.g., qwen3.5:35b-a3b)"
            echo "  --endpoint  OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
    esac
done

# ── Validation ───────────────────────────────────────────
if [[ -z "$MODEL" ]]; then
    log_error "Model name required. Use --model NAME"
    log_info "Examples:"
    log_info "  --model qwen3.5:35b-a3b-q8_0    (ollama)"
    log_info "  --model Qwen3.5-35B-A3B-Q8_0    (llama.cpp server)"
    exit 1
fi

if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
    log_error "Virtual environment not found. Run: make agentic-setup"
    exit 1
fi
source "$VENV_DIR/bin/activate"

# Check that the server is reachable
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
    # Fall back to probing ollama's native API
    if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
        log_info "Ollama detected, using OpenAI-compat endpoint"
    else
        log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
        exit 1
    fi
fi

TS="$(timestamp)"
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
mkdir -p "$RUN_DIR"

log_header "Agentic Evaluation: $SUITE"
log_info "Model:    $MODEL"
log_info "Endpoint: $ENDPOINT"
log_info "Results:  $RUN_DIR"

# Save run metadata
cat > "$RUN_DIR/metadata.json" << ENDJSON
{
  "suite": "$SUITE",
  "model": "$MODEL",
  "endpoint": "$ENDPOINT",
  "timestamp": "$TS",
  "hostname": "$(hostname)"
}
ENDJSON

# ── Start metric logging ────────────────────────────────
METRICS_FILE="$RUN_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!
# Stop the background logger whenever the script exits
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT

# ── Suite execution ──────────────────────────────────────

run_evalplus() {
    local bench="$1"   # humaneval or mbpp
    log_info "Running EvalPlus $bench..."

    OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
        --model "$MODEL" \
        --backend openai \
        --dataset "$bench" \
        --greedy \
        2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"

    # Copy results if generated
    local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
    if [[ -d "$result_dir" ]]; then
        cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
    fi
    log_success "EvalPlus $bench complete"
}

run_inspect_eval() {
    local eval_name="$1"
    local display_name="$2"
    log_info "Running Inspect AI: $display_name..."

    # NOTE: depending on the inspect-ai version, task names may need an
    # explicit package prefix (e.g. inspect_evals/ifeval); adjust if the
    # bare name does not resolve
    OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
        inspect eval "$eval_name" \
        --model "openai/$MODEL" \
        --log-dir "$RUN_DIR/inspect-logs/" \
        2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"

    log_success "Inspect $display_name complete"
}

run_bigcodebench() {
    log_info "Running BigCodeBench..."
    OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
        --model "$MODEL" \
        --backend openai \
        --subset complete \
        2>&1 | tee "$RUN_DIR/bigcodebench.log"
    log_success "BigCodeBench complete"
}

case "$SUITE" in
    quick)
        run_evalplus "humaneval"
        run_inspect_eval "ifeval" "IFEval (instruction following)"
        ;;
    code)
        run_evalplus "humaneval"
        run_evalplus "mbpp"
        run_bigcodebench
        ;;
    tooluse)
        run_inspect_eval "bfcl" "BFCL (function calling)"
        ;;
    full)
        run_evalplus "humaneval"
        run_evalplus "mbpp"
        run_inspect_eval "ifeval" "IFEval (instruction following)"
        run_inspect_eval "bfcl" "BFCL (function calling)"
        run_bigcodebench
        ;;
    *)
        log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
        exit 1
        ;;
esac

# ── Summary ──────────────────────────────────────────────
log_header "Evaluation Complete"
log_info "Results saved to: $RUN_DIR"
log_info "Contents:"
ls -1 "$RUN_DIR" | sed 's/^/  /'

# Parse and display results summary
log_header "Results Summary"
for logfile in "$RUN_DIR"/*.log; do
    [[ -f "$logfile" ]] || continue
    log_name="$(basename "$logfile" .log)"
    echo ""
    echo "  --- $log_name ---"
    # Extract pass rates from common output formats; '|| true' keeps
    # set -e / pipefail from aborting the loop when nothing matches
    grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" | tail -5 | sed 's/^/    /' || true
done
echo ""
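For orientation, given the RUN_DIR pattern above (${SUITE}-${MODEL//[:\/]/_}-${TS}, with ':' and '/' in the model tag mapped to '_'), a quick-suite run produces a layout like this (directory name illustrative; the evals root comes from data_dir in lib/common.sh):

    <evals-dir>/quick-qwen3.5_35b-a3b-q8_0-<timestamp>/
        metadata.json            # run parameters (heredoc above)
        metrics.csv              # sampled every 5 s by log-metrics.sh
        evalplus-humaneval.log   # tee'd tool output, grepped for the summary
        inspect-ifeval.log
        inspect-logs/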
98  scripts/agentic/setup.sh  Normal file
@@ -0,0 +1,98 @@
#!/usr/bin/env bash
# Set up agentic evaluation tools
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────
VENV_DIR="$(data_dir venv)"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
    log_info "Creating Python virtual environment..."
    python3 -m venv "$VENV_DIR"
    log_success "Virtual environment created at $VENV_DIR"
fi

source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"

# ── Install evaluation frameworks ────────────────────────

# Inspect AI — the all-in-one eval framework (BFCL, GAIA, HumanEval, IFEval, etc.)
if python3 -c "import inspect_ai" 2>/dev/null; then
    log_success "inspect-ai already installed"
else
    log_info "Installing inspect-ai (main eval framework)..."
    pip install inspect-ai 2>&1 | tail -3
    log_success "inspect-ai installed"
fi

# EvalPlus — HumanEval+ and MBPP+ with native ollama support
if python3 -c "import evalplus" 2>/dev/null; then
    log_success "evalplus already installed"
else
    log_info "Installing evalplus (code generation benchmarks)..."
    pip install evalplus 2>&1 | tail -3
    log_success "evalplus installed"
fi

# BigCodeBench
if python3 -c "import bigcodebench" 2>/dev/null; then
    log_success "bigcodebench already installed"
else
    log_info "Installing bigcodebench..."
    pip install bigcodebench 2>&1 | tail -3
    log_success "bigcodebench installed"
fi
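# Optional sanity check (a one-liner sketch, not run automatically; the module
# names are the same ones probed by the import checks above):
#   python3 -c "import inspect_ai, evalplus, bigcodebench; print('eval stack OK')"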
# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"

ollama_ok=false
llamacpp_ok=false

if is_cmd ollama; then
    if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
        log_success "ollama running at localhost:11434"
        ollama_ok=true
        # List available models
        log_info "Available ollama models:"
        ollama list 2>/dev/null | head -10 || true
    else
        log_warn "ollama installed but not running. Start with: ollama serve"
    fi
else
    log_info "ollama not installed — needed for most agentic benchmarks"
    log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
fi

# Check for a llama.cpp server
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
    log_success "llama.cpp server running at localhost:8080"
    llamacpp_ok=true
else
    log_info "No llama.cpp server detected at localhost:8080"
    log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi

if ! $ollama_ok && ! $llamacpp_ok; then
    log_warn "No local LLM server running. Agentic benchmarks need one."
fi

# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"
echo ""
echo "  Installed tools:"
echo "    inspect-ai    — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
echo "    evalplus      — HumanEval+ / MBPP+ with native ollama support"
echo "    bigcodebench  — 1,140 coding tasks across 139 libraries"
echo ""
echo "  To activate the virtual environment:"
echo "    source data/venv/bin/activate"
echo ""
echo "  Run evaluations:"
echo "    make agentic-quick   # EvalPlus + IFEval (~1 hour)"
echo "    make agentic-full    # all suites: EvalPlus + IFEval + BFCL + BigCodeBench (~5-6 hours)"
echo ""
@@ -8,13 +8,13 @@ source "$SCRIPT_DIR/../../lib/detect.sh"

 TOOLBOXES_REPO="/data/workspace/projects/HomeLab/strix-halo-toolboxes/amd-strix-halo-llamacpp-toolboxes"
 MODEL_DIR="$(data_dir models)"
+MODEL_CATALOG="$PROJECT_ROOT/configs/models.conf"

 log_header "Benchmark Setup"

 # ── 1. Check toolbox containers ──────────────────────────
 log_info "Checking toolbox containers..."

 # Minimum required: vulkan-radv (most stable)
 REQUIRED_TOOLBOXES=("llama-vulkan-radv")
 OPTIONAL_TOOLBOXES=("llama-rocm-6.4.4" "llama-rocm-7.2" "llama-vulkan-amdvlk")
@@ -22,7 +22,7 @@ existing=$(detect_toolbox_names 2>/dev/null || true)
 missing=()

 for tb in "${REQUIRED_TOOLBOXES[@]}"; do
-    if echo "$existing" | grep -q "^${tb}$"; then
+    if echo "$existing" | grep -qF "$tb"; then
         log_success "Toolbox: $tb"
     else
         missing+=("$tb")
@@ -31,7 +31,7 @@ for tb in "${REQUIRED_TOOLBOXES[@]}"; do
 done

 for tb in "${OPTIONAL_TOOLBOXES[@]}"; do
-    if echo "$existing" | grep -q "^${tb}$"; then
+    if echo "$existing" | grep -qF "$tb"; then
         log_success "Toolbox: $tb (optional)"
     else
         log_info "Toolbox not present: $tb (optional)"
@@ -80,26 +80,54 @@ if (( model_count > 0 )); then
     done
 else
     log_warn "No GGUF models found in $MODEL_DIR"
     log_info "Download a test model. Example:"
 fi

+# ── 4. Show model catalog ───────────────────────────────
+log_header "Model Catalog"
+log_info "Available models (from configs/models.conf):"
+echo ""
+printf "  ${BOLD}%-28s %-10s %-8s %s${RESET}\n" "Name" "Category" "Size" "Description"
+echo "  $(printf '%.0s─' {1..70})"
+while IFS='|' read -r name repo file size_gb category desc; do
+    [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+    local_file="$MODEL_DIR/$file"
+    status=" "
+    if [[ -f "$local_file" ]]; then
+        status="*"
+    fi
+    printf "  %s %-27s %-10s %4s GB %s\n" "$status" "$name" "$category" "$size_gb" "$desc"
+done < "$MODEL_CATALOG"
+echo ""
+echo "  (* = downloaded)"
+echo ""

 # ── 5. Offer downloads ──────────────────────────────────
 if is_cmd huggingface-cli; then
     log_info "Download models with:"
     echo ""
-    echo "  # Small (4B, ~3 GB):"
-    echo "  huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \\"
-    echo "      --local-dir $MODEL_DIR"
+    echo "  # Recommended starter set:"
+    echo "  # Smoke test (3 GB):"
+    echo "  huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf --local-dir $MODEL_DIR"
     echo ""
-    echo "  # Medium (14B, ~9 GB):"
-    echo "  huggingface-cli download Qwen/Qwen3-14B-GGUF Qwen3-14B-Q4_K_M.gguf \\"
-    echo "      --local-dir $MODEL_DIR"
+    echo "  # Top pick — Qwen3.5-35B-A3B MoE Q8 (37 GB, ~85 t/s gen):"
+    echo "  huggingface-cli download unsloth/Qwen3.5-35B-A3B-GGUF Qwen3.5-35B-A3B-Q8_0.gguf --local-dir $MODEL_DIR"
+    echo ""
+    echo "  # Agentic/coding — Qwen3-Coder-30B-A3B (18 GB, best for tool use):"
+    echo "  huggingface-cli download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf --local-dir $MODEL_DIR"
+    echo ""
+    echo "  # Or download any model from catalog:"
+    echo "  #   huggingface-cli download REPO FILE --local-dir $MODEL_DIR"
     echo ""

-    if is_cmd huggingface-cli; then
-        if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as test model?"; then
-            huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \
+    if (( model_count == 0 )); then
+        if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as smoke test model?"; then
+            huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \
                 --local-dir "$MODEL_DIR"
             log_success "Model downloaded"
         fi
-    else
-        log_info "Install huggingface-cli: pip install huggingface_hub[cli]"
     fi
 else
     log_info "Install huggingface-cli: pip install huggingface_hub[cli]"
 fi

 log_header "Setup Complete"
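For reference, the catalog loop in step 4 reads pipe-separated fields in the order name|repo|file|size_gb|category|description, skipping blank and #-prefixed lines. A configs/models.conf entry would therefore look something like this (illustrative values assembled from the download examples above, not copied from the actual file):

    # name|repo|file|size_gb|category|description
    Qwen3.5-35B-A3B|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|Top pick, ~85 t/s gen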