feat: add Qwen3.5 model catalog and agentic evaluation framework

Models:
- configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding)
- Updated benchmark setup to show catalog with download status
- docs/model-recommendations.md: memory planning, quantization guide

Agentic evaluation:
- scripts/agentic/setup.sh: installs inspect-ai, evalplus, bigcodebench in a Python venv
- scripts/agentic/run-eval.sh: runs evaluations against local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+IFEval), code (EvalPlus+BigCodeBench), tooluse (BFCL), full (all)
- bin/agentic: dispatcher with help
- docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use

Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 00:20:23 +01:00
parent 71053997be
commit 58124cd657
11 changed files with 1354 additions and 16 deletions
--- a/scripts/agentic/setup.sh
+++ b/scripts/agentic/setup.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Setup agentic evaluation tools
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/../../lib/common.sh"
+
+log_header "Agentic Evaluation Setup"
+
+# ── Python virtual environment ───────────────────────────
+VENV_DIR="$(data_dir venv)"
+if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
+    log_info "Creating Python virtual environment..."
+    python3 -m venv "$VENV_DIR"
+    log_success "Virtual environment created at $VENV_DIR"
+fi
+
+source "$VENV_DIR/bin/activate"
+log_info "Python: $(python3 --version) from $VENV_DIR"
+
+# ── Install evaluation frameworks ────────────────────────
+
+# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
+if python3 -c "import inspect_ai" 2>/dev/null; then
+    log_success "inspect-ai already installed"
+else
+    log_info "Installing inspect-ai (main eval framework)..."
+    pip install inspect-ai 2>&1 | tail -3
+    log_success "inspect-ai installed"
+fi
+
+# EvalPlus — HumanEval+ and MBPP+ with native ollama support
+if python3 -c "import evalplus" 2>/dev/null; then
+    log_success "evalplus already installed"
+else
+    log_info "Installing evalplus (code generation benchmarks)..."
+    pip install evalplus 2>&1 | tail -3
+    log_success "evalplus installed"
+fi
+
+# BigCodeBench
+if python3 -c "import bigcodebench" 2>/dev/null; then
+    log_success "bigcodebench already installed"
+else
+    log_info "Installing bigcodebench..."
+    pip install bigcodebench 2>&1 | tail -3
+    log_success "bigcodebench installed"
+fi
+
+# ── Check for local LLM server ──────────────────────────
+log_header "LLM Server Check"
+
+ollama_ok=false
+llamacpp_ok=false
+
+if is_cmd ollama; then
+    if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
+        log_success "ollama running at localhost:11434"
+        ollama_ok=true
+        # List available models
+        log_info "Available ollama models:"
+        ollama list 2>/dev/null | head -10 || true
+    else
+        log_warn "ollama installed but not running. Start with: ollama serve"
+    fi
+else
+    log_info "ollama not installed — needed for most agentic benchmarks"
+    log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
+fi
+
+# Check for llama.cpp server
+if curl -s http://localhost:8080/health >/dev/null 2>&1; then
+    log_success "llama.cpp server running at localhost:8080"
+    llamacpp_ok=true
+else
+    log_info "No llama.cpp server detected at localhost:8080"
+    log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
+fi
+
+if ! $ollama_ok && ! $llamacpp_ok; then
+    log_warn "No local LLM server running. Agentic benchmarks need one."
+fi
+
+# ── Summary ──────────────────────────────────────────────
+log_header "Setup Complete"
+echo ""
+echo "  Installed tools:"
+echo "    inspect-ai     — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
+echo "    evalplus        — HumanEval+ / MBPP+ with native ollama support"
+echo "    bigcodebench    — 1,140 coding tasks across 139 libraries"
+echo ""
+echo "  To activate the virtual environment:"
+echo "    source data/venv/bin/activate"
+echo ""
+echo "  Run evaluations:"
+echo "    make agentic-quick      # EvalPlus + IFEval (~1 hour)"
+echo "    make agentic-full       # BFCL + BigCodeBench (~3-4 hours)"
+echo ""