feat: add Qwen3.5 model catalog and agentic evaluation framework
Models:
- configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding)
- Updated benchmark setup to show the catalog with download status
- docs/model-recommendations.md: memory planning, quantization guide

Agentic evaluation:
- scripts/agentic/setup.sh: installs inspect-ai, evalplus, and bigcodebench in a Python venv
- scripts/agentic/run-eval.sh: runs evaluations against a local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+ and IFEval), code (EvalPlus + BigCodeBench), tooluse (BFCL), full (all)
- bin/agentic: dispatcher with help
- docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use

Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
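A typical end-to-end run (flag and suite names are those defined in scripts/agentic/run-eval.sh below; the model tag is illustrative):

    # one-time setup: venv + eval frameworks
    scripts/agentic/setup.sh

    # quick suite (~1 hour) against a local ollama server
    scripts/agentic/run-eval.sh --suite quick \
        --model qwen3.5:35b-a3b-q8_0 \
        --endpoint http://localhost:11434/v1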
180  scripts/agentic/run-eval.sh  Normal file
@@ -0,0 +1,180 @@
#!/usr/bin/env bash
# Run agentic evaluations against a local LLM server
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

VENV_DIR="$(data_dir venv)"
EVAL_DIR="$(data_dir evals)"

# ── Argument parsing ─────────────────────────────────────
SUITE="quick"
MODEL=""
ENDPOINT="http://localhost:11434/v1"   # ollama's default OpenAI-compat endpoint
PROVIDER="openai"                      # only the OpenAI-compatible backend is wired up

while [[ $# -gt 0 ]]; do
    case "$1" in
        --suite|-s)    SUITE="$2"; shift 2 ;;
        --model|-m)    MODEL="$2"; shift 2 ;;
        --endpoint|-e) ENDPOINT="$2"; shift 2 ;;
        --help|-h)
            echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]"
            echo ""
            echo "Suites:"
            echo "  quick     EvalPlus HumanEval+ + IFEval (~1 hour)"
            echo "  code      EvalPlus + BigCodeBench (~2-3 hours)"
            echo "  tooluse   BFCL function calling (~1-2 hours)"
            echo "  full      All of the above (~5-6 hours)"
            echo ""
            echo "Options:"
            echo "  --model     Model name as known by the server (e.g., qwen3.5:35b-a3b)"
            echo "  --endpoint  OpenAI-compatible endpoint (default: http://localhost:11434/v1)"
            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
    esac
done

# ── Validation ───────────────────────────────────────────
if [[ -z "$MODEL" ]]; then
    log_error "Model name required. Use --model NAME"
    log_info "Examples:"
    log_info "  --model qwen3.5:35b-a3b-q8_0    (ollama)"
    log_info "  --model Qwen3.5-35B-A3B-Q8_0    (llama.cpp server)"
    exit 1
fi

if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
    log_error "Virtual environment not found. Run: make agentic-setup"
    exit 1
fi
source "$VENV_DIR/bin/activate"

# Check that the server is reachable
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
    # Fall back to probing ollama's native API
    if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
        log_info "Ollama detected, using OpenAI-compat endpoint"
    else
        log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
        exit 1
    fi
fi

TS="$(timestamp)"
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
mkdir -p "$RUN_DIR"

log_header "Agentic Evaluation: $SUITE"
log_info "Model:    $MODEL"
log_info "Endpoint: $ENDPOINT"
log_info "Results:  $RUN_DIR"

# Save run metadata
cat > "$RUN_DIR/metadata.json" << ENDJSON
{
  "suite": "$SUITE",
  "model": "$MODEL",
  "endpoint": "$ENDPOINT",
  "timestamp": "$TS",
  "hostname": "$(hostname)"
}
ENDJSON

# ── Start metric logging ────────────────────────────────
METRICS_FILE="$RUN_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!
# Stop the background logger whenever the script exits
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT

# ── Suite execution ──────────────────────────────────────

run_evalplus() {
    local bench="$1"   # humaneval or mbpp
    log_info "Running EvalPlus $bench..."

    OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \
        --model "$MODEL" \
        --backend openai \
        --dataset "$bench" \
        --greedy \
        2>&1 | tee "$RUN_DIR/evalplus-${bench}.log"

    # Copy results if generated
    local result_dir="$HOME/.evalplus/${MODEL}/${bench}"
    if [[ -d "$result_dir" ]]; then
        cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true
    fi
    log_success "EvalPlus $bench complete"
}

run_inspect_eval() {
    local eval_name="$1"
    local display_name="$2"
    log_info "Running Inspect AI: $display_name..."

    # NOTE: depending on the inspect-ai version, task names may need an
    # explicit package prefix (e.g. inspect_evals/ifeval); adjust if the
    # bare name does not resolve
    OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
        inspect eval "$eval_name" \
        --model "openai/$MODEL" \
        --log-dir "$RUN_DIR/inspect-logs/" \
        2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"

    log_success "Inspect $display_name complete"
}

run_bigcodebench() {
    log_info "Running BigCodeBench..."
    OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \
        --model "$MODEL" \
        --backend openai \
        --subset complete \
        2>&1 | tee "$RUN_DIR/bigcodebench.log"
    log_success "BigCodeBench complete"
}

case "$SUITE" in
    quick)
        run_evalplus "humaneval"
        run_inspect_eval "ifeval" "IFEval (instruction following)"
        ;;
    code)
        run_evalplus "humaneval"
        run_evalplus "mbpp"
        run_bigcodebench
        ;;
    tooluse)
        run_inspect_eval "bfcl" "BFCL (function calling)"
        ;;
    full)
        run_evalplus "humaneval"
        run_evalplus "mbpp"
        run_inspect_eval "ifeval" "IFEval (instruction following)"
        run_inspect_eval "bfcl" "BFCL (function calling)"
        run_bigcodebench
        ;;
    *)
        log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full"
        exit 1
        ;;
esac

# ── Summary ──────────────────────────────────────────────
log_header "Evaluation Complete"
log_info "Results saved to: $RUN_DIR"
log_info "Contents:"
ls -1 "$RUN_DIR" | sed 's/^/  /'

# Parse and display results summary
log_header "Results Summary"
for logfile in "$RUN_DIR"/*.log; do
    [[ -f "$logfile" ]] || continue
    log_name="$(basename "$logfile" .log)"
    echo ""
    echo "  --- $log_name ---"
    # Extract pass rates from common output formats; '|| true' keeps
    # set -e / pipefail from aborting the loop when nothing matches
    grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" | tail -5 | sed 's/^/    /' || true
done
echo ""
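For orientation, given the RUN_DIR pattern above (${SUITE}-${MODEL//[:\/]/_}-${TS}, with ':' and '/' in the model tag mapped to '_'), a quick-suite run produces a layout like this (directory name illustrative; the evals root comes from data_dir in lib/common.sh):

    <evals-dir>/quick-qwen3.5_35b-a3b-q8_0-<timestamp>/
        metadata.json            # run parameters (heredoc above)
        metrics.csv              # sampled every 5 s by log-metrics.sh
        evalplus-humaneval.log   # tee'd tool output, grepped for the summary
        inspect-ifeval.log
        inspect-logs/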
98  scripts/agentic/setup.sh  Normal file
@@ -0,0 +1,98 @@
#!/usr/bin/env bash
# Set up agentic evaluation tools
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────
VENV_DIR="$(data_dir venv)"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
    log_info "Creating Python virtual environment..."
    python3 -m venv "$VENV_DIR"
    log_success "Virtual environment created at $VENV_DIR"
fi

source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"

# ── Install evaluation frameworks ────────────────────────

# Inspect AI — the all-in-one eval framework (BFCL, GAIA, HumanEval, IFEval, etc.)
if python3 -c "import inspect_ai" 2>/dev/null; then
    log_success "inspect-ai already installed"
else
    log_info "Installing inspect-ai (main eval framework)..."
    pip install inspect-ai 2>&1 | tail -3
    log_success "inspect-ai installed"
fi

# EvalPlus — HumanEval+ and MBPP+ with native ollama support
if python3 -c "import evalplus" 2>/dev/null; then
    log_success "evalplus already installed"
else
    log_info "Installing evalplus (code generation benchmarks)..."
    pip install evalplus 2>&1 | tail -3
    log_success "evalplus installed"
fi

# BigCodeBench
if python3 -c "import bigcodebench" 2>/dev/null; then
    log_success "bigcodebench already installed"
else
    log_info "Installing bigcodebench..."
    pip install bigcodebench 2>&1 | tail -3
    log_success "bigcodebench installed"
fi
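# Optional sanity check (a one-liner sketch, not run automatically; the module
# names are the same ones probed by the import checks above):
#   python3 -c "import inspect_ai, evalplus, bigcodebench; print('eval stack OK')"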
# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"

ollama_ok=false
llamacpp_ok=false

if is_cmd ollama; then
    if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
        log_success "ollama running at localhost:11434"
        ollama_ok=true
        # List available models
        log_info "Available ollama models:"
        ollama list 2>/dev/null | head -10 || true
    else
        log_warn "ollama installed but not running. Start with: ollama serve"
    fi
else
    log_info "ollama not installed — needed for most agentic benchmarks"
    log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
fi

# Check for a llama.cpp server
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
    log_success "llama.cpp server running at localhost:8080"
    llamacpp_ok=true
else
    log_info "No llama.cpp server detected at localhost:8080"
    log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi

if ! $ollama_ok && ! $llamacpp_ok; then
    log_warn "No local LLM server running. Agentic benchmarks need one."
fi

# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"
echo ""
echo "  Installed tools:"
echo "    inspect-ai    — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
echo "    evalplus      — HumanEval+ / MBPP+ with native ollama support"
echo "    bigcodebench  — 1,140 coding tasks across 139 libraries"
echo ""
echo "  To activate the virtual environment:"
echo "    source data/venv/bin/activate"
echo ""
echo "  Run evaluations:"
echo "    make agentic-quick   # EvalPlus + IFEval (~1 hour)"
echo "    make agentic-full    # all suites: EvalPlus + IFEval + BFCL + BigCodeBench (~5-6 hours)"
echo ""
@@ -8,13 +8,13 @@ source "$SCRIPT_DIR/../../lib/detect.sh"

 TOOLBOXES_REPO="/data/workspace/projects/HomeLab/strix-halo-toolboxes/amd-strix-halo-llamacpp-toolboxes"
 MODEL_DIR="$(data_dir models)"
+MODEL_CATALOG="$PROJECT_ROOT/configs/models.conf"

 log_header "Benchmark Setup"

 # ── 1. Check toolbox containers ──────────────────────────
 log_info "Checking toolbox containers..."

 # Minimum required: vulkan-radv (most stable)
 REQUIRED_TOOLBOXES=("llama-vulkan-radv")
 OPTIONAL_TOOLBOXES=("llama-rocm-6.4.4" "llama-rocm-7.2" "llama-vulkan-amdvlk")
@@ -22,7 +22,7 @@ existing=$(detect_toolbox_names 2>/dev/null || true)
 missing=()

 for tb in "${REQUIRED_TOOLBOXES[@]}"; do
-    if echo "$existing" | grep -q "^${tb}$"; then
+    if echo "$existing" | grep -qF "$tb"; then
         log_success "Toolbox: $tb"
     else
         missing+=("$tb")
@@ -31,7 +31,7 @@ for tb in "${REQUIRED_TOOLBOXES[@]}"; do
 done

 for tb in "${OPTIONAL_TOOLBOXES[@]}"; do
-    if echo "$existing" | grep -q "^${tb}$"; then
+    if echo "$existing" | grep -qF "$tb"; then
         log_success "Toolbox: $tb (optional)"
     else
         log_info "Toolbox not present: $tb (optional)"
@@ -80,26 +80,54 @@ if (( model_count > 0 )); then
     done
 else
     log_warn "No GGUF models found in $MODEL_DIR"
     log_info "Download a test model. Example:"
 fi

+# ── 4. Show model catalog ───────────────────────────────
+log_header "Model Catalog"
+log_info "Available models (from configs/models.conf):"
+echo ""
+printf "  ${BOLD}%-28s %-10s %-8s %s${RESET}\n" "Name" "Category" "Size" "Description"
+echo "  $(printf '%.0s─' {1..70})"
+while IFS='|' read -r name repo file size_gb category desc; do
+    [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+    local_file="$MODEL_DIR/$file"
+    status=" "
+    if [[ -f "$local_file" ]]; then
+        status="*"
+    fi
+    printf "  %s %-27s %-10s %4s GB %s\n" "$status" "$name" "$category" "$size_gb" "$desc"
+done < "$MODEL_CATALOG"
+echo ""
+echo "  (* = downloaded)"
+echo ""

 # ── 5. Offer downloads ──────────────────────────────────
 if is_cmd huggingface-cli; then
     log_info "Download models with:"
     echo ""
-    echo "  # Small (4B, ~3 GB):"
-    echo "  huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \\"
-    echo "      --local-dir $MODEL_DIR"
+    echo "  # Recommended starter set:"
+    echo "  # Smoke test (3 GB):"
+    echo "  huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf --local-dir $MODEL_DIR"
     echo ""
-    echo "  # Medium (14B, ~9 GB):"
-    echo "  huggingface-cli download Qwen/Qwen3-14B-GGUF Qwen3-14B-Q4_K_M.gguf \\"
-    echo "      --local-dir $MODEL_DIR"
+    echo "  # Top pick — Qwen3.5-35B-A3B MoE Q8 (37 GB, ~85 t/s gen):"
+    echo "  huggingface-cli download unsloth/Qwen3.5-35B-A3B-GGUF Qwen3.5-35B-A3B-Q8_0.gguf --local-dir $MODEL_DIR"
+    echo ""
+    echo "  # Agentic/coding — Qwen3-Coder-30B-A3B (18 GB, best for tool use):"
+    echo "  huggingface-cli download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf --local-dir $MODEL_DIR"
+    echo ""
+    echo "  # Or download any model from catalog:"
+    echo "  #   huggingface-cli download REPO FILE --local-dir $MODEL_DIR"
     echo ""

-    if is_cmd huggingface-cli; then
-        if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as test model?"; then
-            huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \
+    if (( model_count == 0 )); then
+        if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as smoke test model?"; then
+            huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \
                 --local-dir "$MODEL_DIR"
             log_success "Model downloaded"
         fi
-    else
-        log_info "Install huggingface-cli: pip install huggingface_hub[cli]"
     fi
 else
     log_info "Install huggingface-cli: pip install huggingface_hub[cli]"
 fi

 log_header "Setup Complete"
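For reference, the catalog loop in step 4 reads pipe-separated fields in the order name|repo|file|size_gb|category|description, skipping blank and #-prefixed lines. A configs/models.conf entry would therefore look something like this (illustrative values assembled from the download examples above, not copied from the actual file):

    # name|repo|file|size_gb|category|description
    Qwen3.5-35B-A3B|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|Top pick, ~85 t/s gen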