Compare commits
4 Commits
dd403a907c
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c847991740 | ||
|
|
15bb6a8ed9 | ||
|
|
474d94a07e | ||
|
|
6ab08537ca |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,6 +1,9 @@
|
|||||||
data/
|
data/
|
||||||
|
.venv/
|
||||||
*.log
|
*.log
|
||||||
*.csv
|
*.csv
|
||||||
*.tmp
|
*.tmp
|
||||||
.claude/
|
.claude/
|
||||||
.idea/
|
.idea/
|
||||||
|
evalplus_results/
|
||||||
|
ztop/
|
||||||
|
|||||||
14
CLAUDE.md
14
CLAUDE.md
@@ -41,9 +41,21 @@ make verify # 9-point optimization checklist
|
|||||||
bin/audit --json | python3 -m json.tool # Verify JSON output is valid
|
bin/audit --json | python3 -m json.tool # Verify JSON output is valid
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Serving
|
||||||
|
|
||||||
|
`scripts/serve/launch.sh` with dispatcher at `bin/serve`. Launches llama-server inside toolbox containers with optimized defaults: Vulkan RADV, q4_0 KV cache, flash attention, no-mmap, full GPU offload. Key flags:
|
||||||
|
- `--ngram` — n-gram speculative decoding (~1.1-1.4x tg for repetitive content)
|
||||||
|
- `--no-think` — disables thinking/reasoning via `--reasoning-budget 0` (faster for evals)
|
||||||
|
- `--ctx N` — context size (default 131072)
|
||||||
|
- `--parallel N` — concurrent request slots
|
||||||
|
|
||||||
|
## System Tuning
|
||||||
|
|
||||||
|
`scripts/optimize/power-profile.sh` applies Phase 2 optimizations: RyzenAdj PPT increase (85W target, HP caps at 70W sustained), sysctl tuning (vm.swappiness=1, vm.max_map_count=500000), THP=always, RADV_PERFTEST=nogttspill. Systemd services for boot/resume persistence at `configs/ryzenadj-llm.service` and `configs/ryzenadj-resume.service`.
|
||||||
|
|
||||||
## Agentic Evaluation
|
## Agentic Evaluation
|
||||||
|
|
||||||
Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `data/venv/`. Eval frameworks: inspect-ai (all-in-one), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint (ollama or llama.cpp server). Model catalog at `configs/models.conf`.
|
Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `.venv/` (Python 3.13, dependencies in `requirements.txt`). Eval frameworks: inspect-ai (all-in-one), inspect-evals (task definitions), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint — auto-detects llama-server (port 8080) or ollama (port 11434). Model catalog at `configs/models.conf`.
|
||||||
|
|
||||||
## External Resources
|
## External Resources
|
||||||
|
|
||||||
|
|||||||
17
Makefile
17
Makefile
@@ -39,12 +39,27 @@ benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare
|
|||||||
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
||||||
|
|
||||||
# --- Serve ---
|
# --- Serve ---
|
||||||
serve: ## Launch llama-server with optimized settings (ARGS="-m MODEL.gguf")
|
serve: ## Launch APEX I-Compact daily driver (2 slots, 256K ctx)
|
||||||
|
@bash bin/serve -m Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf --parallel 2 --ctx 262144 $(ARGS)
|
||||||
|
|
||||||
|
serve-custom: ## Launch llama-server with custom model (ARGS="-m MODEL.gguf")
|
||||||
@bash bin/serve $(ARGS)
|
@bash bin/serve $(ARGS)
|
||||||
|
|
||||||
serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf")
|
serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf")
|
||||||
@bash bin/serve --ngram $(ARGS)
|
@bash bin/serve --ngram $(ARGS)
|
||||||
|
|
||||||
|
flush-gpu: ## Kill llama-server/bench processes and drop kernel caches to free unified VRAM
|
||||||
|
-@pkill -x llama-server 2>/dev/null || true
|
||||||
|
-@pkill -x llama-bench 2>/dev/null || true
|
||||||
|
-@pkill -x llama-cli 2>/dev/null || true
|
||||||
|
-@podman ps --filter name=llama --format '{{.Names}}' | xargs -r podman stop
|
||||||
|
@sync && sudo sysctl vm.drop_caches=3
|
||||||
|
@echo "VRAM usage:" && cat /sys/class/drm/card*/device/mem_info_vram_used 2>/dev/null | awk '{printf " %.2f MiB\n", $$1/1048576}'
|
||||||
|
|
||||||
|
# --- Hardware Info ---
|
||||||
|
hw-bandwidth: ## Measure GPU memory bandwidth and compute (clpeak)
|
||||||
|
@clpeak 2>&1
|
||||||
|
|
||||||
# --- Optimize ---
|
# --- Optimize ---
|
||||||
optimize: ## Interactive optimization walkthrough
|
optimize: ## Interactive optimization walkthrough
|
||||||
@bash bin/optimize --all
|
@bash bin/optimize --all
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ case "${1:-help}" in
|
|||||||
echo " --category LIST Comma-separated: smoke,dense,moe"
|
echo " --category LIST Comma-separated: smoke,dense,moe"
|
||||||
echo " --skip-longctx Skip long-context (32K) tests"
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
echo " --reps N Standard test repetitions (default: 5)"
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)"
|
||||||
echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
|
echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
|
|||||||
@@ -2,28 +2,39 @@
|
|||||||
# Format: NAME|HF_REPO|FILE|SIZE_GB|CATEGORY|DESCRIPTION
|
# Format: NAME|HF_REPO|FILE|SIZE_GB|CATEGORY|DESCRIPTION
|
||||||
#
|
#
|
||||||
# Categories: smoke, standard, moe, dense
|
# Categories: smoke, standard, moe, dense
|
||||||
# Download with: huggingface-cli download REPO FILE --local-dir /data/models/llms/REPO
|
# Download with: hf download REPO FILE --local-dir /data/models/llms/REPO
|
||||||
|
|
||||||
# ── Smoke tests (quick, small) ───────────────────────────
|
# ── Smoke tests (quick, small) ───────────────────────────
|
||||||
qwen3.5-0.8b-q8|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|smoke|Tiny, Q8 full precision
|
qwen2.5-0.5b-q8|lmstudio-community/Qwen2.5-0.5B-Instruct-GGUF|Qwen2.5-0.5B-Instruct-Q8_0.gguf|0.4|smoke|Tiny Qwen2.5, Q8
|
||||||
|
qwen3.5-0.8b-q8|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|smoke|Tiny Qwen3.5, Q8
|
||||||
qwen3.5-2b-q4|unsloth/Qwen3.5-2B-GGUF|Qwen3.5-2B-Q4_K_S.gguf|1.2|smoke|Small dense 2B
|
qwen3.5-2b-q4|unsloth/Qwen3.5-2B-GGUF|Qwen3.5-2B-Q4_K_S.gguf|1.2|smoke|Small dense 2B
|
||||||
qwen3.5-4b-q4|unsloth/Qwen3.5-4B-GGUF|Qwen3.5-4B-Q4_K_S.gguf|2.5|smoke|Small dense 4B
|
qwen3.5-4b-q4|unsloth/Qwen3.5-4B-GGUF|Qwen3.5-4B-Q4_K_S.gguf|2.5|smoke|Small dense 4B
|
||||||
|
|
||||||
# ── Standard dense models ────────────────────────────────
|
# ── Standard dense models ────────────────────────────────
|
||||||
qwen3.5-9b-q4|unsloth/Qwen3.5-9B-GGUF|Qwen3.5-9B-Q4_K_S.gguf|5.1|dense|Dense 9B
|
qwen3.5-9b-q4|unsloth/Qwen3.5-9B-GGUF|Qwen3.5-9B-Q4_K_S.gguf|5.1|dense|Dense 9B
|
||||||
gpt-oss-20b-mxfp4|lmstudio-community/gpt-oss-20b-GGUF|gpt-oss-20b-MXFP4.gguf|12|dense|GPT-OSS 20B MXFP4
|
glm-4.7-flash-q6|unsloth/GLM-4.7-Flash-GGUF|GLM-4.7-Flash-UD-Q6_K_XL.gguf|24|moe|GLM 4.7 Flash, UD Q6 (MoE 30B, 3B active)
|
||||||
glm-4.7-flash-q6|lmstudio-community/GLM-4.7-Flash-GGUF|GLM-4.7-Flash-Q6_K.gguf|23|dense|GLM 4.7 Flash Q6
|
|
||||||
|
|
||||||
# ── Qwen3.5-27B dense (download needed) ─────────────────
|
# ── Gemma 4 ────────────────────────────────────────────
|
||||||
|
gemma-4-26b-a4b-q6xl|unsloth/gemma-4-26B-A4B-it-GGUF|gemma-4-26B-A4B-it-UD-Q6_K_XL.gguf|22|moe|Gemma 4 MoE 26B, 4B active, UD Q6 XL
|
||||||
|
gemma-4-26b-a4b-q4s|unsloth/gemma-4-26B-A4B-it-GGUF|gemma-4-26B-A4B-it-UD-Q4_K_S.gguf|15|moe|Gemma 4 MoE 26B, 4B active, UD Q4
|
||||||
|
gemma-4-31b-q3xl|unsloth/gemma-4-31B-it-GGUF|gemma-4-31B-it-UD-Q3_K_XL.gguf|14|dense|Gemma 4 dense 31B, UD Q3 XL
|
||||||
|
|
||||||
|
# ── Qwen3.5-27B dense ──────────────────────────────────
|
||||||
qwen3.5-27b-q4|unsloth/Qwen3.5-27B-GGUF|Qwen3.5-27B-Q4_K_M.gguf|17|dense|Dense 27B, quality-first
|
qwen3.5-27b-q4|unsloth/Qwen3.5-27B-GGUF|Qwen3.5-27B-Q4_K_M.gguf|17|dense|Dense 27B, quality-first
|
||||||
|
qwen3.5-27b-opus-distill|Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF|Qwen3.5-27B.Q4_K_M.gguf|15|dense|Dense 27B, Claude Opus reasoning distilled v2
|
||||||
|
|
||||||
# ── MoE models (fast generation, best for 64GB) ─────────
|
# ── MoE models (fast generation, best for 64GB) ─────────
|
||||||
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
||||||
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
||||||
|
qwen3.5-35b-a3b-apex-compact|mudler/Qwen3.5-35B-A3B-Claude-Distilled-APEX-GGUF|Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf|17|moe|MoE 35B Claude-distilled APEX, I-Compact quant
|
||||||
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
||||||
|
|
||||||
# ── Coding models ─────────────────────────────────────────
|
# ── Coding models ─────────────────────────────────────────
|
||||||
qwen3-coder-30b-a3b-q6|unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF|Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf|26|moe|Agentic coding MoE, pure Transformer
|
qwen3-coder-30b-a3b-q6|unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF|Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf|26|moe|Agentic coding MoE, pure Transformer
|
||||||
|
qwen3-coder-next-q3|unsloth/Qwen3-Coder-Next-GGUF|Qwen3-Coder-Next-UD-Q3_K_XL.gguf|34|moe|80B MoE coder, >70% SWE-bench, hybrid DeltaNet
|
||||||
|
|
||||||
|
# ── Pruned MoE (REAP expert pruning) ─────────────────────
|
||||||
|
qwen3.5-122b-a10b-reap40-q4|0xSero/Qwen3.5-122B-A10B-REAP-40-GGUF|Qwen3.5-122B-A10B-REAP-40-Q4_K_M.gguf|46|moe|122B MoE pruned to 40 experts, 10B active, Q4_K_M
|
||||||
|
|
||||||
# ── Draft models (speculative decoding) ───────────────────
|
# ── Draft models (speculative decoding) ───────────────────
|
||||||
qwen3.5-0.8b-q8-draft|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|draft|Draft for Qwen3.5 speculative decoding
|
qwen3.5-0.8b-q8-draft|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|draft|Draft for Qwen3.5 speculative decoding
|
||||||
|
|||||||
706
docs/agentic-coding-evaluation-landscape.md
Normal file
706
docs/agentic-coding-evaluation-landscape.md
Normal file
@@ -0,0 +1,706 @@
|
|||||||
|
# Agentic Coding Evaluation Landscape
|
||||||
|
|
||||||
|
Comprehensive research into the dimensions, benchmarks, and model performance for
|
||||||
|
evaluating LLMs in software engineering agent use cases. Research date: 2026-03-30.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Evaluation Taxonomy](#1-evaluation-taxonomy)
|
||||||
|
2. [Dimension 1: Code Generation Accuracy](#2-code-generation-accuracy)
|
||||||
|
3. [Dimension 2: Code Editing / Patching](#3-code-editing--patching)
|
||||||
|
4. [Dimension 3: Tool Use / Function Calling](#4-tool-use--function-calling)
|
||||||
|
5. [Dimension 4: Multi-Step Planning](#5-multi-step-planning)
|
||||||
|
6. [Dimension 5: Debugging / Error Recovery](#6-debugging--error-recovery)
|
||||||
|
7. [Dimension 6: Repository Understanding](#7-repository-understanding)
|
||||||
|
8. [Dimension 7: Instruction Following](#8-instruction-following)
|
||||||
|
9. [Dimension 8: Long Context Utilization](#9-long-context-utilization)
|
||||||
|
10. [Dimension 9: Multi-Language Support](#10-multi-language-support)
|
||||||
|
11. [Dimension 10: Test Generation](#11-test-generation)
|
||||||
|
12. [Benchmark Suite Summary](#12-benchmark-suite-summary)
|
||||||
|
13. [Open-Weight Model Landscape for 64GB Systems](#13-open-weight-model-landscape-for-64gb-systems)
|
||||||
|
14. [Frontier vs. Open Model Gap](#14-frontier-vs-open-model-gap)
|
||||||
|
15. [Recommended Evaluation Stack](#15-recommended-evaluation-stack)
|
||||||
|
16. [Sources](#16-sources)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Evaluation Taxonomy
|
||||||
|
|
||||||
|
Recent survey work (CSLLM Survey, 2025; SE Agent Benchmark Survey, 2025) organizes
|
||||||
|
coding LLM evaluation along two orthogonal axes:
|
||||||
|
|
||||||
|
- **Capability dimension**: What is being measured (generation, editing, tool use,
|
||||||
|
planning, debugging, comprehension, instruction following, etc.)
|
||||||
|
- **Evaluation paradigm**: How it is measured (static benchmarks, execution-based
|
||||||
|
evaluation, agent-in-the-loop evaluation, human evaluation)
|
||||||
|
|
||||||
|
The field has moved decisively from static benchmarks (HumanEval, MBPP) toward
|
||||||
|
agent-in-the-loop evaluations (SWE-bench, Terminal-Bench, FeatureBench) that test
|
||||||
|
the full agentic loop: plan, act, observe, iterate. This shift matters because models
|
||||||
|
that score 95%+ on HumanEval can still fail below 50% on realistic agentic tasks.
|
||||||
|
|
||||||
|
The ten dimensions below map to the capability axis. Each dimension lists the
|
||||||
|
benchmarks that best isolate it, though in practice most agentic benchmarks test
|
||||||
|
multiple dimensions simultaneously.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Code Generation Accuracy
|
||||||
|
|
||||||
|
**Definition**: Writing correct, complete code from natural-language specifications or
|
||||||
|
docstrings, measured by functional correctness (pass@k on test suites).
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | Languages | Metric | Notes |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| **HumanEval** (Chen et al., 2021) | 164 | Python | pass@k | Foundational but near-saturated; best models >95% |
|
||||||
|
| **HumanEval+** / **MBPP+** (EvalPlus, NeurIPS 2023) | 164 / 399 | Python | pass@k (80x more tests) | Catches false positives from HumanEval; ~10-15% score drops |
|
||||||
|
| **HumanEval Pro** / **MBPP Pro** (ACL 2025) | 164 / 399 | Python | pass@k on self-invoking tasks | Tests compositional reasoning; o1-mini drops from 96.2% to 76.2% |
|
||||||
|
| **BigCodeBench** (ICLR 2025) | 1,140 | Python (139 libs) | pass@1 | Multi-tool, cross-domain; best model (GPT-4o) ~60% Complete, <50% Instruct |
|
||||||
|
| **BigCodeBench-Hard** | 148 | Python | pass@1 | Hardest subset; human performance 97%, LLMs ~60% |
|
||||||
|
| **LiveCodeBench** (EMNLP 2025) | Rolling | Python | pass@k | Contamination-free: new problems added continuously from competitive programming |
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
- **Frontier**: Claude Opus 4.5/4.6, GPT-5.2, Gemini 3.1 Pro all score >95% on
|
||||||
|
HumanEval, ~85% on HumanEval+, ~65% on BigCodeBench-Complete.
|
||||||
|
- **Open (64GB-feasible)**: Qwen3.5-27B-Q4 achieves ~80% on HumanEval+.
|
||||||
|
Qwen3-Coder-30B-A3B (3.3B active, ~18GB at Q4) is strong on BigCodeBench.
|
||||||
|
Qwen2.5-Coder-32B-Instruct matched GPT-4o on HumanEval when released.
|
||||||
|
|
||||||
|
### Key Insight
|
||||||
|
|
||||||
|
HumanEval is near-saturated and should no longer be used as a primary differentiator.
|
||||||
|
BigCodeBench and LiveCodeBench are the current gold standards for code generation
|
||||||
|
accuracy, as they test realistic multi-library tasks and resist contamination.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Code Editing / Patching
|
||||||
|
|
||||||
|
**Definition**: Modifying existing code correctly -- applying diffs, fixing bugs in
|
||||||
|
context, integrating new code into existing files -- rather than generating from scratch.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | What It Tests | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **Aider Code Editing** | 133 | Edit Python files to solve Exercism problems | Tests edit format compliance + coding ability |
|
||||||
|
| **Aider Polyglot** | 225 | Edit code across 6 languages with error feedback | Two attempts per problem; measures edit+debug loop |
|
||||||
|
| **Diff-XYZ** (Oct 2025) | 3 tasks | Apply, anti-apply, generate diffs | Tests diff understanding in multiple formats |
|
||||||
|
| **EDIT-Bench** | Varied | Real-world instructed code edits | Repository-level editing tasks |
|
||||||
|
| **SWE-bench** (indirectly) | 2,294 | Generate patches that resolve GitHub issues | Requires generating correct unified diffs |
|
||||||
|
|
||||||
|
### Edit Format Considerations
|
||||||
|
|
||||||
|
Code editing performance depends heavily on the edit format used:
|
||||||
|
|
||||||
|
- **Search/Replace blocks** (Aider default): Most reliable for most models
|
||||||
|
- **Unified diff**: GPT-4 Turbo was "3x less lazy" with unified diffs (Aider blog)
|
||||||
|
- **V4A diff format**: OpenAI's recommended format (published with GPT-4.1, April 2025)
|
||||||
|
- **Whole-file rewrite**: Simpler but wasteful; works with weaker models
|
||||||
|
|
||||||
|
Models that excel at generation can fail at editing because they struggle to produce
|
||||||
|
syntactically valid diffs or correctly locate the code to modify.
|
||||||
|
|
||||||
|
### State of the Art (Aider Polyglot, March 2026)
|
||||||
|
|
||||||
|
| Model | Score | Type |
|
||||||
|
|---|---|---|
|
||||||
|
| GPT-5 | 88.0% | Frontier |
|
||||||
|
| MiniMax M2.5 | 80.2% | Open |
|
||||||
|
| DeepSeek V3.2-Exp | 74.2% | Open |
|
||||||
|
| DeepSeek-R1-0528 | 71.4% | Open |
|
||||||
|
| GLM-4.5-FP8 | 66.0% | Open |
|
||||||
|
| Qwen3-Coder-480B | 61.8% | Open (too large for 64GB) |
|
||||||
|
| Qwen3-Coder-30B-A3B | ~55-60%* | Open (fits 64GB at Q4) |
|
||||||
|
|
||||||
|
*Estimated from quantized GGUF performance data; exact Aider Polyglot score for
|
||||||
|
the 30B-A3B variant not independently confirmed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Tool Use / Function Calling
|
||||||
|
|
||||||
|
**Definition**: Correctly invoking APIs, tools, or MCP servers -- selecting the right
|
||||||
|
function, constructing valid arguments, parsing responses, deciding when NOT to call.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | What It Tests | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **BFCL V4** (Berkeley) | Thousands | Function calling accuracy across formats | De facto standard; AST-based evaluation |
|
||||||
|
| **BFCL-v3** (via EvalScope) | Multi-turn | Stateful multi-step function calling | Tests memory and context management |
|
||||||
|
| **Nexus Function Calling** | Varied | Tool selection and invocation | Broader tool landscape |
|
||||||
|
| **IFEval-FC** (2025) | 500+ | Instruction following within function schemas | JSON schema constraint adherence |
|
||||||
|
| **tau-bench** | Varied | Tool-augmented task completion | End-to-end agent tool use |
|
||||||
|
|
||||||
|
### BFCL Key Findings
|
||||||
|
|
||||||
|
The Berkeley Function Calling Leaderboard reveals a critical split:
|
||||||
|
|
||||||
|
1. **Single-turn calls**: Most frontier models score >90% accuracy
|
||||||
|
2. **Multi-turn stateful calls**: Performance drops 20-40% even for top models
|
||||||
|
3. **Abstention**: Knowing when NOT to call a function remains a major weakness
|
||||||
|
4. **Long-horizon tool use**: Memory, dynamic decision-making, and context management
|
||||||
|
are open challenges
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
- **Frontier**: Claude Opus 4.5/4.6, GPT-5.2 lead overall BFCL V4
|
||||||
|
- **Open**: Qwen3-Coder-480B is "comparable to Claude Sonnet 4 on Agentic Tool-Use"
|
||||||
|
(Qwen team). For 64GB-feasible models, Qwen3-Coder-30B-A3B has a specially
|
||||||
|
designed function call format and strong tool-use training.
|
||||||
|
Nemotron 3 Super (120B, 12B active) was explicitly trained for tool-use workflows.
|
||||||
|
|
||||||
|
### Relevance to MCP
|
||||||
|
|
||||||
|
MCP (Model Context Protocol) servers expose tools via JSON schemas -- exactly what
|
||||||
|
BFCL tests. A model's BFCL score is a reasonable proxy for MCP tool-use competence,
|
||||||
|
though MCP adds discovery and session management complexity not yet benchmarked.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Multi-Step Planning
|
||||||
|
|
||||||
|
**Definition**: Breaking complex tasks into subtasks, maintaining coherent plans across
|
||||||
|
many steps, tracking progress, and adapting when plans fail.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | Steps | What It Tests | Notes |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| **SWE-bench Verified** | 500 | 5-50+ | End-to-end issue resolution | Gold standard for agentic coding |
|
||||||
|
| **SWE-bench Pro** (Scale AI) | Harder | 10-100+ | More complex issues | Best model ~46% (vs 81% on Verified) |
|
||||||
|
| **FeatureBench** (Feb 2026) | 200 | Many | Complex feature development | Claude 4.5 Opus: only 11.0% (vs 74.4% SWE-bench) |
|
||||||
|
| **Snorkel Agentic Coding** | 100 | Multi-step, 4 tiers | Plan, track, execute, recover | Claude Opus 4.5: 58%, Gemini 3 Pro: 51.6% |
|
||||||
|
| **GAIA** (ICLR 2025) | 450 | Multi-step | General assistant planning | Near saturation (~90% top scores) |
|
||||||
|
| **Gaia2** (2026) | Varied | Async | Dynamic, asynchronous environments | Adds temporal constraints and agent collaboration |
|
||||||
|
| **Terminal-Bench 2.0** | 89 | Multi-step | Terminal workflow completion | Tests plan execution in CLI environments |
|
||||||
|
|
||||||
|
### Planning-Specific Insights
|
||||||
|
|
||||||
|
The gap between SWE-bench Verified (~81% frontier) and SWE-bench Pro (~46% frontier)
|
||||||
|
and FeatureBench (~11% frontier) reveals that multi-step planning degrades rapidly
|
||||||
|
with task complexity:
|
||||||
|
|
||||||
|
- **SWE-bench Verified**: Often requires 5-15 steps (find file, understand bug, edit,
|
||||||
|
test)
|
||||||
|
- **SWE-bench Pro**: Requires deeper reasoning about architecture and dependencies
|
||||||
|
- **FeatureBench**: Requires implementing features across multiple files with
|
||||||
|
architectural coherence over 50+ steps
|
||||||
|
|
||||||
|
This is the dimension where frontier models most decisively outperform open models,
|
||||||
|
though the gap is narrowing with agentic RL training (Qwen3-Coder, GLM-5).
|
||||||
|
|
||||||
|
### State of the Art (SWE-bench Verified, March 2026)
|
||||||
|
|
||||||
|
| Model | Score | Type | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Claude Opus 4.5 | 80.9% | Frontier | Overall leader |
|
||||||
|
| Claude Opus 4.6 | 80.8% | Frontier | |
|
||||||
|
| Gemini 3.1 Pro | 80.6% | Frontier | |
|
||||||
|
| MiniMax M2.5 | 80.2% | Open | Best open model |
|
||||||
|
| GPT-5.2 | 80.0% | Frontier | |
|
||||||
|
| GLM-5 | 77.8% | Open | 744B MoE, 40B active |
|
||||||
|
| Kimi K2.5 | 76.8% | Open | |
|
||||||
|
| DeepSeek V3.2 | 73.0% | Open | |
|
||||||
|
| Qwen3-Coder-Next | 70.6% | Open | Only 3B active params |
|
||||||
|
| DeepSeek V3.1 | 66.0% | Open | |
|
||||||
|
| Nemotron 3 Super | 60.5% | Open | 120B, 12B active |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Debugging / Error Recovery
|
||||||
|
|
||||||
|
**Definition**: Handling test failures, reading error messages, diagnosing root causes,
|
||||||
|
and iterating toward a fix -- including recovering from the agent's own mistakes.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | What It Tests | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **Terminal-Bench 2.0** (Stanford/Laude) | 89 | CLI debugging, error recovery, state mgmt | Gold standard for debugging evaluation |
|
||||||
|
| **Recovery-Bench** (Letta, 2025) | Varied | Recovery from corrupted states and error traces | Tests context pollution handling |
|
||||||
|
| **AgentErrorBench** (2025) | Varied | Error detection and debugging in trajectories | 24% improvement with AgentDebug method |
|
||||||
|
| **ReliabilityBench** (Jan 2026) | Varied | Consistency and fault recovery | Multi-dimensional reliability |
|
||||||
|
| **Aider Polyglot** (indirectly) | 225 | Two-attempt model with error feedback | Second attempt tests debug-from-feedback |
|
||||||
|
|
||||||
|
### Recovery-Bench Key Findings
|
||||||
|
|
||||||
|
Recovery-Bench (Letta) specifically evaluates a critical gap: even frontier models
|
||||||
|
"lack the ability to naturally recover from failed states." The benchmark creates
|
||||||
|
scenarios with:
|
||||||
|
|
||||||
|
- Erroneous files from previous attempts
|
||||||
|
- Corrupted reasoning traces in context
|
||||||
|
- Environment artifacts from failed edits
|
||||||
|
|
||||||
|
This is directly relevant to agentic coding loops where an agent makes a mistake
|
||||||
|
at step 15 of a 30-step task and must recover without starting over.
|
||||||
|
|
||||||
|
### Terminal-Bench 2.0 Key Findings
|
||||||
|
|
||||||
|
Terminal-Bench tests real terminal workflows: inspect environments, read/edit files,
|
||||||
|
run commands, recover from errors, and finish multi-step tasks. Error categories:
|
||||||
|
|
||||||
|
- **Execution errors**: Dominate for Claude Opus 4.5 and GPT-5.2
|
||||||
|
- **Coherence errors**: Less frequent but more damaging
|
||||||
|
- **Verification errors**: Failing to check that a fix actually worked
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
Debugging/error recovery is one of the weakest dimensions for all models. No model
|
||||||
|
achieves >70% on Terminal-Bench 2.0 or Recovery-Bench as of March 2026. This is
|
||||||
|
a primary area where the frontier-open gap matters most for practical agentic use.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Repository Understanding
|
||||||
|
|
||||||
|
**Definition**: Navigating large codebases, understanding file structure, dependency
|
||||||
|
graphs, cross-file relationships, and architectural patterns.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | Languages | What It Tests | Notes |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| **CrossCodeEval** (NeurIPS 2023) | Varied | Python, Java, TS, C# | Cross-file code completion | Requires understanding imports and dependencies |
|
||||||
|
| **RepoBench** | 3 tasks | Python | Retrieval, completion, pipeline | Tests codebase navigation |
|
||||||
|
| **RepoEval** | Varied | Python | Repository-level completion | 16 GitHub repositories |
|
||||||
|
| **RepoCod** (ACL 2025) | Varied | Multiple | Full repository code generation | "LLMs not yet ready" |
|
||||||
|
| **LoCoBench-Agent** (2025) | Varied | Multiple | Interactive repo exploration | Agent-based evaluation |
|
||||||
|
| **DependEval** | 3 tasks | Multiple | Dependency recognition, multi-file editing | Tests architectural understanding |
|
||||||
|
|
||||||
|
### Key Challenge
|
||||||
|
|
||||||
|
Repository understanding is difficult to isolate as a benchmark dimension because
|
||||||
|
it is a prerequisite for most agentic coding tasks. SWE-bench implicitly tests it
|
||||||
|
(you cannot fix a bug if you cannot find the relevant file), but does not score it
|
||||||
|
separately.
|
||||||
|
|
||||||
|
The most direct measures are:
|
||||||
|
1. **CrossCodeEval**: Do predictions improve when cross-file context is provided?
|
||||||
|
2. **RepoBench-R**: Can the model retrieve the right context from the repository?
|
||||||
|
3. **DependEval**: Can the model understand and modify dependency relationships?
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
Models with longer context windows have an inherent advantage. The Qwen3-Coder family
|
||||||
|
was explicitly trained for "repository-scale understanding" with 256K native context
|
||||||
|
(extendable to 1M). GLM-5 uses DeepSeek Sparse Attention for 205K context.
|
||||||
|
|
||||||
|
For 64GB systems, Qwen3-Coder-30B-A3B and Qwen3-Coder-Next are the strongest choices
|
||||||
|
due to their long-context training and MoE efficiency.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Instruction Following
|
||||||
|
|
||||||
|
**Definition**: Following complex, multi-constraint instructions precisely --
|
||||||
|
formatting requirements, length constraints, keyword inclusion, structural rules.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | What It Tests | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **IFEval** (Google, Nov 2023) | ~500 | 25 types of verifiable instructions | Format, length, keyword, structure constraints |
|
||||||
|
| **IFEval-Extended** (2024) | Dynamic | Generative instruction synthesis | Thousands of unique instructions from templates |
|
||||||
|
| **M-IFEval** (NAACL 2025) | Multi-lingual | French, Japanese, Spanish instruction following | Performance varies widely across languages |
|
||||||
|
| **IFEval-FC** (2025) | Varied | Instruction following in function call schemas | JSON schema constraint adherence |
|
||||||
|
| **AgentIF** (Tsinghua, 2025) | Varied | Agent-specific instruction following | Evaluates IF within agentic loops |
|
||||||
|
|
||||||
|
### Relevance to Agentic Coding
|
||||||
|
|
||||||
|
Instruction following is critical for agentic coding because:
|
||||||
|
|
||||||
|
1. **System prompts**: Agents receive detailed behavioral instructions (e.g., CLAUDE.md
|
||||||
|
conventions in this repo)
|
||||||
|
2. **Edit format compliance**: Models must produce output in exact formats (search/replace
|
||||||
|
blocks, unified diffs, JSON tool calls)
|
||||||
|
3. **Multi-constraint tasks**: "Fix the bug AND add a test AND update the docstring AND
|
||||||
|
follow the project's naming conventions"
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
IFEval is included in the Open LLM Leaderboard V2, making it one of the most widely
|
||||||
|
reported benchmarks. Frontier models score >90% on IFEval. Open models vary widely;
|
||||||
|
instruction-tuned variants of Qwen3.5, DeepSeek V3, and GLM-5 are competitive at >85%.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Long Context Utilization
|
||||||
|
|
||||||
|
**Definition**: Effectively using large context windows (32K-1M tokens) with code --
|
||||||
|
not just accepting long inputs, but actually using information from all parts.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | What It Tests | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| **RULER** (NVIDIA, COLM 2024) | Multi-needle retrieval, distractor handling | Most models degrade significantly beyond 32K |
|
||||||
|
| **Needle in a Haystack** (NIAH) | Single-fact retrieval in long context | Near-saturated for frontier models |
|
||||||
|
| **LoCoBench** (2025) | Long-context code completion and comprehension | Claude 3.5 Sonnet: 29% at short context, 3% at long |
|
||||||
|
| **LongCodeBench** (2025) | Long-context code tasks | Single-language, limited diversity |
|
||||||
|
| **LongBench** (ACL 2025) | General long-context evaluation | Reveals limitations of existing benchmarks |
|
||||||
|
|
||||||
|
### "Context Rot" Phenomenon
|
||||||
|
|
||||||
|
Research from Chroma (2025) documented "context rot": as input tokens increase,
|
||||||
|
LLM performance degrades even when the relevant information is present. This is
|
||||||
|
particularly acute for code tasks where:
|
||||||
|
|
||||||
|
- File A defines a class, file B imports it, file C tests it
|
||||||
|
- All three must be in context simultaneously
|
||||||
|
- Models must cross-reference across files, not just retrieve individual facts
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
| Model | Native Context | Effective Context* | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Nemotron 3 Super | 1M tokens | 91.75% accuracy at 1M | Best retention score |
|
||||||
|
| Qwen3-Coder-Next | 256K (1M w/ Yarn) | Good at 256K | Trained for repo-scale |
|
||||||
|
| GLM-5 | 205K | Good | DeepSeek Sparse Attention |
|
||||||
|
| DeepSeek V3.2 | 128K | Moderate | |
|
||||||
|
|
||||||
|
*"Effective context" means the model actually uses information at that distance,
|
||||||
|
not just accepts it without error.
|
||||||
|
|
||||||
|
For 64GB systems, context length is bounded by available memory. At Q4 quantization,
|
||||||
|
a 30B-A3B model can handle ~64K-128K tokens before running out of KV cache space
|
||||||
|
(depending on GQA configuration and batch size).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Multi-Language Support
|
||||||
|
|
||||||
|
**Definition**: Handling different programming languages correctly -- not just Python,
|
||||||
|
but also compiled languages, systems languages, and less common languages.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Languages | What It Tests | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **Aider Polyglot** | C++, Go, Java, JS, Python, Rust | Edit + debug in 6 languages | 225 Exercism exercises |
|
||||||
|
| **Multi-SWE-bench** (NeurIPS 2025) | Python, Java, TS, JS, Go, Rust, C, C++ | Issue resolution in 8 languages | 1,632 validated issues |
|
||||||
|
| **Multi-SWE-bench mini** | 8 languages | Lightweight version | 400 instances, reduced compute |
|
||||||
|
| **SWE-PolyBench** (Amazon) | Java, JS, TS, Python | Bug fixes, features, refactoring | 2,110 curated issues |
|
||||||
|
| **SWE-smith** | 9 languages | SWE-bench style across 42 repos | 300 curated tasks |
|
||||||
|
| **HumanEval-X** | Python, C++, Java, JS, Go | Cross-lingual code generation | Translation of HumanEval |
|
||||||
|
| **BigCodeBench** | Python (139 libs) | Multi-library Python | Tests library-specific knowledge |
|
||||||
|
|
||||||
|
### Multi-SWE-bench vs SWE-PolyBench
|
||||||
|
|
||||||
|
Two competing multilingual benchmarks emerged in 2025:
|
||||||
|
|
||||||
|
- **Multi-SWE-bench** (ByteDance): 1,632 issues, 8 languages, NeurIPS 2025
|
||||||
|
Datasets track. Also provides `mini` (400 instances) and `flash` (300 instances)
|
||||||
|
variants for reduced compute.
|
||||||
|
- **SWE-PolyBench** (Amazon): 2,110 issues, 4 languages, with a verified subset of
|
||||||
|
384 instances. Covers bug fixes, features, and refactoring.
|
||||||
|
|
||||||
|
### Language-Specific Performance Gaps
|
||||||
|
|
||||||
|
Open models show significant performance variation across languages:
|
||||||
|
- **Python**: Best-supported universally
|
||||||
|
- **JavaScript/TypeScript**: Second-best, strong ecosystem coverage
|
||||||
|
- **Rust, Go, C++**: Substantially weaker, especially for complex patterns
|
||||||
|
- **Low-resource languages** (Julia, Lua, Perl): StarCoder2-15B historically strong here
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
Qwen3-Coder-Next achieves 62.8% on SWE-bench Multilingual. For 64GB-feasible models,
|
||||||
|
the Qwen3-Coder-30B-A3B benefits from Qwen's broad multilingual training data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Test Generation
|
||||||
|
|
||||||
|
**Definition**: Writing tests, understanding test frameworks, achieving coverage,
|
||||||
|
generating meaningful assertions -- not just syntactically valid tests.
|
||||||
|
|
||||||
|
### Key Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Tasks | What It Tests | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **TestEval** (2024) | 210 | LLM test case generation for LeetCode programs | Basic test generation ability |
|
||||||
|
| **ULT** (2025) | 3,909 | Unit test generation for complex functions | High cyclomatic complexity, leakage-free |
|
||||||
|
| **WebApp1K** (2025) | 1,000 | Test-driven development tasks | Tests serve as both prompt and verification |
|
||||||
|
| **CoverUp** (2024) | Varied | Coverage-guided test generation | Iterative LLM-guided coverage improvement |
|
||||||
|
|
||||||
|
### Current Performance
|
||||||
|
|
||||||
|
LLM-generated tests achieve on average:
|
||||||
|
- **41.32%** accuracy (tests pass and are meaningful)
|
||||||
|
- **45.10%** statement coverage
|
||||||
|
- **30.22%** branch coverage
|
||||||
|
- **40.21%** mutation score
|
||||||
|
|
||||||
|
These numbers are from a multi-model benchmark study (2025). CoverUp's iterative
|
||||||
|
approach achieves 80% line+branch coverage (vs 47% for CodaMosa), suggesting that
|
||||||
|
agentic test generation loops significantly outperform single-shot generation.
|
||||||
|
|
||||||
|
### Key Insight
|
||||||
|
|
||||||
|
Test generation is an area where agentic approaches (generate, run, check coverage,
|
||||||
|
iterate) dramatically outperform single-shot generation. This makes it particularly
|
||||||
|
suited to the iterative agent loop and a strong candidate for local model evaluation.
|
||||||
|
|
||||||
|
### State of the Art
|
||||||
|
|
||||||
|
Code agents were shown to be "state of the art software testers" when given an
|
||||||
|
iterative loop with coverage feedback (2024 paper). No single model dominates this
|
||||||
|
dimension; the scaffolding (coverage feedback, iteration) matters more than the
|
||||||
|
base model for test generation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Benchmark Suite Summary
|
||||||
|
|
||||||
|
### Tier 1: Must-Run for Agentic Coding Evaluation
|
||||||
|
|
||||||
|
These are the most informative benchmarks for evaluating a model's fitness as a
|
||||||
|
coding agent:
|
||||||
|
|
||||||
|
| Benchmark | Primary Dimensions | Run Cost | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **SWE-bench Verified** | Planning, editing, repo understanding | High (500 Docker envs) | Gold standard |
|
||||||
|
| **Aider Polyglot** | Editing, multi-lang, debugging | Medium (225 problems) | Best edit benchmark |
|
||||||
|
| **BigCodeBench** | Generation, multi-tool | Medium (1,140 tasks) | Best generation benchmark |
|
||||||
|
| **BFCL V4** | Tool use, function calling | Low-Medium | De facto tool-use standard |
|
||||||
|
| **Terminal-Bench 2.0** | Debugging, planning, error recovery | High (89 real envs) | Best debugging benchmark |
|
||||||
|
|
||||||
|
### Tier 2: Valuable Supplementary Benchmarks
|
||||||
|
|
||||||
|
| Benchmark | Primary Dimensions | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| **LiveCodeBench** | Generation (contamination-free) | Rolling benchmark |
|
||||||
|
| **IFEval** | Instruction following | Quick to run, widely reported |
|
||||||
|
| **Multi-SWE-bench mini** | Multi-language, planning | 400 instances, 8 languages |
|
||||||
|
| **EvalPlus (HumanEval+/MBPP+)** | Generation (rigorous) | Good baseline |
|
||||||
|
| **Recovery-Bench** | Error recovery | Novel and underexplored |
|
||||||
|
| **FeatureBench** | Complex planning | Very hard; differentiates top models |
|
||||||
|
|
||||||
|
### Tier 3: Niche or Near-Saturated
|
||||||
|
|
||||||
|
| Benchmark | Status | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| **HumanEval** | Near-saturated | >95% for frontier models; use EvalPlus instead |
|
||||||
|
| **MBPP** | Near-saturated | Use MBPP+ instead |
|
||||||
|
| **GAIA** | Near-saturation (~90%) | Good for general agents, less code-specific |
|
||||||
|
| **Needle-in-a-Haystack** | Saturated | Use RULER for long-context |
|
||||||
|
|
||||||
|
### Commonly Cited on Model Cards
|
||||||
|
|
||||||
|
When coding-focused models publish on Hugging Face, the most frequently cited
|
||||||
|
benchmarks (in rough order of frequency) are:
|
||||||
|
|
||||||
|
1. SWE-bench Verified (agentic coding standard)
|
||||||
|
2. HumanEval / HumanEval+ (code generation baseline)
|
||||||
|
3. MBPP / MBPP+ (code generation)
|
||||||
|
4. BigCodeBench (multi-tool generation)
|
||||||
|
5. Aider Polyglot (code editing, multi-language)
|
||||||
|
6. LiveCodeBench (contamination-free generation)
|
||||||
|
7. BFCL (function calling)
|
||||||
|
8. IFEval (instruction following)
|
||||||
|
9. Multi-SWE-bench (multilingual agentic)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 13. Open-Weight Model Landscape for 64GB Systems
|
||||||
|
|
||||||
|
### Models Feasible on 64GB Unified Memory (Strix Halo)
|
||||||
|
|
||||||
|
Sorted by practical fitness for agentic coding tasks. "Active" = parameters active
|
||||||
|
per forward pass for MoE models.
|
||||||
|
|
||||||
|
| Model | Total / Active | GGUF Q4 Size | SWE-bench | Key Strength |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| **Qwen3-Coder-Next** | 80B / 3B | ~46GB (Q4) | 70.6% Verified | Best efficiency ratio; agentic RL training |
|
||||||
|
| **Qwen3-Coder-30B-A3B** | 30.5B / 3.3B | ~18GB (Q4) | ~55%* (est.) | Fits easily; native 256K context; function call format |
|
||||||
|
| **Qwen3.5-35B-A3B** | 35B / 3B | ~19GB (Q4) | N/A | General + coding; fast at 112 tok/s on RTX 3090 |
|
||||||
|
| **Nemotron 3 Super** | 120B / 12B | ~64GB (Q4) | 60.5% | 1M context; PinchBench 85.6%; hybrid Mamba-Transformer |
|
||||||
|
| **Qwen3.5-27B** | 27B / 27B (dense) | ~17GB (Q4) | ~55%* | Dense; 72.4% SWE-bench reported at full precision |
|
||||||
|
| **DeepSeek V3.2** | 671B / 37B | Too large at Q4 | 73.0% | Requires >200GB; not feasible for 64GB |
|
||||||
|
| **GLM-5** | 744B / 40B | Too large at Q4 | 77.8% | Best open SWE-bench; not feasible for 64GB |
|
||||||
|
|
||||||
|
*Estimated; exact scores for quantized GGUF variants not independently benchmarked.
|
||||||
|
|
||||||
|
### Recommended Configuration for 64GB Strix Halo
|
||||||
|
|
||||||
|
**Primary coding agent**: Qwen3-Coder-30B-A3B-Instruct (Q4_K_M, ~18GB)
|
||||||
|
- Fits with ample room for KV cache and context
|
||||||
|
- Specially designed function call format
|
||||||
|
- Native 256K context, extendable to 1M
|
||||||
|
- Strong agentic coding training
|
||||||
|
- Fast inference due to 3.3B active parameters
|
||||||
|
|
||||||
|
**Stretch option**: Qwen3-Coder-Next (Q4, ~46GB)
|
||||||
|
- Tighter fit but significantly stronger (70.6% SWE-bench Verified)
|
||||||
|
- 3B active parameters = good generation speed
|
||||||
|
- Leaves ~18GB for KV cache and system
|
||||||
|
|
||||||
|
**Dense alternative**: Qwen3.5-27B (Q4_K_M, ~17GB)
|
||||||
|
- When you need strong general + coding ability
|
||||||
|
- Dense model = more predictable behavior
|
||||||
|
- Good baseline for comparison
|
||||||
|
|
||||||
|
### Older Models: Still Relevant?
|
||||||
|
|
||||||
|
- **CodeLlama-34B** (Meta, 2023): Superseded by Qwen and DeepSeek families. Only
|
||||||
|
relevant for historical comparison or if specific fine-tunes are needed.
|
||||||
|
- **StarCoder2-15B** (ServiceNow/HF/NVIDIA, 2024): Outperformed CodeLlama-34B at half
|
||||||
|
the size. Still competitive for low-resource languages (Julia, Lua, Perl) but
|
||||||
|
otherwise superseded by Qwen3-Coder.
|
||||||
|
- **DeepSeek-Coder-V2-Lite-16B** (2024): Was competitive but now clearly behind
|
||||||
|
Qwen3-Coder-30B-A3B and Qwen3-Coder-Next.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 14. Frontier vs. Open Model Gap
|
||||||
|
|
||||||
|
### Gap Analysis by Dimension (March 2026)
|
||||||
|
|
||||||
|
| Dimension | Frontier Best | Open Best (64GB) | Gap | Trend |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| Code Generation | ~98% HumanEval | ~85% HumanEval | Small | Closing rapidly |
|
||||||
|
| Code Editing | 88% Aider Polyglot | ~60% Aider Polyglot | Large | Closing (MoE helps) |
|
||||||
|
| Tool Use | >90% BFCL | ~80% BFCL | Moderate | Closing with dedicated training |
|
||||||
|
| Multi-Step Planning | 80.9% SWE-bench | 70.6% SWE-bench (Coder-Next) | Moderate | Narrowing with agentic RL |
|
||||||
|
| Debugging/Recovery | ~65% Terminal-Bench | ~45% Terminal-Bench* | Large | Widest persistent gap |
|
||||||
|
| Repo Understanding | Excellent | Good (long-context models) | Moderate | Closing with 256K+ contexts |
|
||||||
|
| Instruction Following | >90% IFEval | >85% IFEval | Small | Nearly closed |
|
||||||
|
| Long Context | 1M+ effective | 256K effective | Moderate | Hardware-limited for local |
|
||||||
|
| Multi-Language | 80%+ Multi-SWE | 62.8% Multi-SWE | Moderate | Improving with diverse training |
|
||||||
|
| Test Generation | ~50% coverage | ~40% coverage | Small | Scaffolding matters more |
|
||||||
|
|
||||||
|
*Estimated; Terminal-Bench scores not widely reported for 64GB-feasible open models.
|
||||||
|
|
||||||
|
### Key Observations
|
||||||
|
|
||||||
|
1. **Code generation is nearly solved** for simple tasks. The gap has shifted to
|
||||||
|
complex, multi-step, multi-file tasks.
|
||||||
|
|
||||||
|
2. **Debugging/error recovery is the widest gap** and the hardest to close. This is
|
||||||
|
where frontier models' larger parameter counts and RLHF refinement matter most.
|
||||||
|
|
||||||
|
3. **MoE architectures are the bridge** for 64GB systems. Models like Qwen3-Coder-Next
|
||||||
|
(80B total, 3B active) achieve SWE-bench scores comparable to models with 10-20x
|
||||||
|
more active parameters.
|
||||||
|
|
||||||
|
4. **Agentic RL training** (as used in Qwen3-Coder, GLM-5) is the primary driver of
|
||||||
|
open model improvement on planning and debugging dimensions.
|
||||||
|
|
||||||
|
5. **Scaffolding equalizes** many gaps. A well-designed agent scaffold (SWE-Agent,
|
||||||
|
OpenHands, Aider) can make a 30B model perform comparably to a raw 400B model.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 15. Recommended Evaluation Stack
|
||||||
|
|
||||||
|
For evaluating models locally on the Strix Halo system, the following stack covers
|
||||||
|
all 10 dimensions using tools already referenced in this project's `docs/references.md`:
|
||||||
|
|
||||||
|
### Inspect AI (Primary Framework)
|
||||||
|
|
||||||
|
Inspect AI supports multiple benchmarks in a unified framework:
|
||||||
|
- HumanEval (code generation)
|
||||||
|
- BigCodeBench (multi-tool generation)
|
||||||
|
- BFCL (function calling / tool use)
|
||||||
|
- GAIA (multi-step planning)
|
||||||
|
- IFEval (instruction following)
|
||||||
|
|
||||||
|
Run against an OpenAI-compatible endpoint (ollama or llama.cpp server).
|
||||||
|
|
||||||
|
### EvalPlus (Code Generation)
|
||||||
|
|
||||||
|
- HumanEval+ and MBPP+ with native ollama support
|
||||||
|
- More rigorous than base HumanEval/MBPP
|
||||||
|
- Already configured in this project's `scripts/agentic/` framework
|
||||||
|
|
||||||
|
### BigCodeBench (Multi-Tool Generation)
|
||||||
|
|
||||||
|
- 1,140 tasks across 139 libraries
|
||||||
|
- Already listed in `docs/references.md`
|
||||||
|
- Tests multi-library, cross-domain code generation
|
||||||
|
|
||||||
|
### Aider (Code Editing + Multi-Language)
|
||||||
|
|
||||||
|
- Built-in polyglot benchmark: 225 exercises across 6 languages
|
||||||
|
- Tests edit format compliance, multi-language support, debugging loop
|
||||||
|
- Can be run against any OpenAI-compatible endpoint
|
||||||
|
|
||||||
|
### BFCL (Tool Use)
|
||||||
|
|
||||||
|
- Install with `pip install bfcl-eval`
|
||||||
|
- Tests function calling accuracy
|
||||||
|
- Already listed in `docs/references.md`
|
||||||
|
|
||||||
|
### Practical Execution Order
|
||||||
|
|
||||||
|
1. **Quick smoke test**: EvalPlus (HumanEval+) -- ~30 min
|
||||||
|
2. **Generation depth**: BigCodeBench-Hard (148 tasks) -- ~2-4 hours
|
||||||
|
3. **Editing ability**: Aider polyglot benchmark -- ~4-6 hours
|
||||||
|
4. **Tool use**: BFCL eval -- ~1-2 hours
|
||||||
|
5. **Instruction following**: IFEval via Inspect AI -- ~1 hour
|
||||||
|
6. **Full agentic**: SWE-bench Verified (if Docker resources available) -- ~24+ hours
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 16. Sources
|
||||||
|
|
||||||
|
### Papers
|
||||||
|
|
||||||
|
- Chen et al. (2021). "Evaluating Large Language Models Trained on Code." arXiv:2107.03374. [HumanEval]
|
||||||
|
- Liu et al. (2023). "Is Your Code Generated by ChatGPT Really Correct?" NeurIPS 2023. [EvalPlus/HumanEval+]
|
||||||
|
- Jimenez et al. (2024). "SWE-bench: Can Language Models Resolve Real-world GitHub Issues?" ICLR 2024.
|
||||||
|
- Zhuo et al. (2024). "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls." ICLR 2025.
|
||||||
|
- Patil et al. (2025). "The Berkeley Function Calling Leaderboard (BFCL)." ICML 2025.
|
||||||
|
- Mialon et al. (2023). "GAIA: A Benchmark for General AI Assistants." ICLR 2024. [GAIA]
|
||||||
|
- Hsieh et al. (2024). "RULER: What's the Real Context Size of Your Long-Context Language Models?" COLM 2024.
|
||||||
|
- Zhou et al. (2023). "Instruction-Following Evaluation for Large Language Models." arXiv:2311.07911. [IFEval]
|
||||||
|
- Terminal-Bench team (2026). "Terminal-Bench: Benchmarking Agents on Hard CLI Tasks." Stanford/Laude Institute.
|
||||||
|
- FeatureBench (Feb 2026). "Benchmarking Agentic Coding for Complex Feature Development." arXiv:2602.10975.
|
||||||
|
- HumanEval Pro / MBPP Pro (ACL 2025). "Evaluating LLMs on Self-invoking Code Generation Task."
|
||||||
|
- Multi-SWE-bench (NeurIPS 2025). "A Multilingual Benchmark for Issue Resolving."
|
||||||
|
- SWE-PolyBench (Amazon, 2025). "A multi-language benchmark for repository level evaluation."
|
||||||
|
- Recovery-Bench (Letta, 2025). "Evaluating LLMs' Ability to Recover from Mistakes."
|
||||||
|
- Diff-XYZ (Oct 2025). "A Benchmark for Evaluating Diff Understanding."
|
||||||
|
|
||||||
|
### Leaderboards and Live Data
|
||||||
|
|
||||||
|
- SWE-bench Leaderboard: https://www.swebench.com/
|
||||||
|
- SWE-bench Verified Leaderboard: https://llm-stats.com/benchmarks/swe-bench-verified
|
||||||
|
- SWE-rebench Leaderboard: https://swe-rebench.com/
|
||||||
|
- Aider LLM Leaderboards: https://aider.chat/docs/leaderboards/
|
||||||
|
- BFCL V4 Leaderboard: https://gorilla.cs.berkeley.edu/leaderboard.html
|
||||||
|
- EvalPlus Leaderboard: https://evalplus.github.io/leaderboard.html
|
||||||
|
- BigCodeBench Leaderboard: https://huggingface.co/blog/leaderboard-bigcodebench
|
||||||
|
- Terminal-Bench Leaderboard: https://www.tbench.ai/
|
||||||
|
- Open LLM Leaderboard: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard
|
||||||
|
- Scale Labs SWE-bench Pro: https://labs.scale.com/leaderboard/swe_bench_pro_public
|
||||||
|
- Artificial Analysis Terminal-Bench: https://artificialanalysis.ai/evaluations/terminalbench-hard
|
||||||
|
|
||||||
|
### Model Documentation
|
||||||
|
|
||||||
|
- Qwen3-Coder: https://github.com/QwenLM/Qwen3-Coder
|
||||||
|
- Qwen3-Coder-Next: https://qwen.ai/blog?id=qwen3-coder-next
|
||||||
|
- Qwen3-Coder-30B-A3B GGUF: https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF
|
||||||
|
- GLM-5: https://huggingface.co/zai-org/GLM-5
|
||||||
|
- Nemotron 3 Super: https://developer.nvidia.com/blog/introducing-nemotron-3-super-an-open-hybrid-mamba-transformer-moe-for-agentic-reasoning/
|
||||||
|
- DeepSeek V3 series: https://www.bentoml.com/blog/the-complete-guide-to-deepseek-models-from-v3-to-r1-and-beyond
|
||||||
|
|
||||||
|
### Tools and Frameworks
|
||||||
|
|
||||||
|
- Inspect AI: https://github.com/UKGovernmentBEIS/inspect_ai
|
||||||
|
- Inspect Evals catalog: https://inspect.aisi.org.uk/evals/
|
||||||
|
- EvalPlus: https://github.com/evalplus/evalplus
|
||||||
|
- BigCodeBench: https://github.com/bigcode-project/bigcodebench
|
||||||
|
- BFCL: https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard
|
||||||
|
- Aider: https://aider.chat/
|
||||||
|
- Aider Polyglot benchmark: https://github.com/Aider-AI/polyglot-benchmark
|
||||||
|
- LiveCodeBench: https://livecodebench.github.io/
|
||||||
|
- CoverUp (test generation): https://arxiv.org/html/2403.16218v3
|
||||||
@@ -41,7 +41,8 @@ The `-fa 1 -mmp 0 -ngl 99` flags are **mandatory** on Strix Halo to avoid crashe
|
|||||||
| `llama-vulkan-radv` | Mesa RADV | Vulkan | Most stable, recommended default |
|
| `llama-vulkan-radv` | Mesa RADV | Vulkan | Most stable, recommended default |
|
||||||
| `llama-vulkan-amdvlk` | AMDVLK | Vulkan | Fastest when it works, 2GB buffer limit |
|
| `llama-vulkan-amdvlk` | AMDVLK | Vulkan | Fastest when it works, 2GB buffer limit |
|
||||||
| `llama-rocm-6.4.4` | ROCm 6.4.4 | HIP | Proven stable |
|
| `llama-rocm-6.4.4` | ROCm 6.4.4 | HIP | Proven stable |
|
||||||
| `llama-rocm-7.2` | ROCm 7.2 | HIP | Latest, compiler fixes applied |
|
| `llama-rocm-7.2.1` | ROCm 7.2.1 | HIP | Current stable (kernel 6.18.4+ patch) |
|
||||||
|
| `llama-rocm-7.2` | ROCm 7.2 | HIP | Deprecated — use 7.2.1 |
|
||||||
| `llama-rocm7-nightlies` | ROCm 7 nightly | HIP | Experimental/development builds |
|
| `llama-rocm7-nightlies` | ROCm 7 nightly | HIP | Experimental/development builds |
|
||||||
|
|
||||||
Containers are from [kyuz0/amd-strix-halo-toolboxes](https://github.com/kyuz0/amd-strix-halo-toolboxes). Set up with `make benchmark-setup`.
|
Containers are from [kyuz0/amd-strix-halo-toolboxes](https://github.com/kyuz0/amd-strix-halo-toolboxes). Set up with `make benchmark-setup`.
|
||||||
|
|||||||
12
requirements.txt
Normal file
12
requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Agentic evaluation frameworks
|
||||||
|
# Install: python3.13 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt
|
||||||
|
# Requires Python >=3.10, <3.14 (bigcodebench constraint)
|
||||||
|
|
||||||
|
inspect-ai>=0.3.201
|
||||||
|
inspect-evals>=0.6.0
|
||||||
|
evalplus>=0.3.1
|
||||||
|
bigcodebench>=0.2.5
|
||||||
|
openai>=2.26.0
|
||||||
|
|
||||||
|
# IFEval dependency (not on PyPI)
|
||||||
|
instruction_following_eval @ git+https://github.com/josejg/instruction_following_eval
|
||||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
|||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||||
|
|
||||||
VENV_DIR="$(data_dir venv)"
|
VENV_DIR="$PROJECT_ROOT/.venv"
|
||||||
EVAL_DIR="$(data_dir evals)"
|
EVAL_DIR="$(data_dir evals)"
|
||||||
|
|
||||||
# ── Argument parsing ─────────────────────────────────────
|
# ── Argument parsing ─────────────────────────────────────
|
||||||
@@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
# ── Validation ───────────────────────────────────────────
|
# ── Validation ───────────────────────────────────────────
|
||||||
if [[ -z "$MODEL" ]]; then
|
|
||||||
log_error "Model name required. Use --model NAME"
|
|
||||||
log_info "Examples:"
|
|
||||||
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
|
|
||||||
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
||||||
log_error "Virtual environment not found. Run: make agentic-setup"
|
log_error "Virtual environment not found. Run: make agentic-setup"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
source "$VENV_DIR/bin/activate"
|
source "$VENV_DIR/bin/activate"
|
||||||
|
|
||||||
# Check server is reachable
|
# Auto-detect server if no explicit endpoint given
|
||||||
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
|
if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
|
||||||
# Try ollama native endpoint
|
if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
|
||||||
if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
|
ENDPOINT="http://localhost:8080/v1"
|
||||||
log_info "Ollama detected, using OpenAI-compat endpoint"
|
log_info "Auto-detected llama-server at localhost:8080"
|
||||||
|
elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
|
||||||
|
log_info "Auto-detected ollama at localhost:11434"
|
||||||
else
|
else
|
||||||
log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
|
log_error "No LLM server found. Start one first:"
|
||||||
|
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
|
||||||
|
log_info " ollama serve (ollama)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
|
||||||
|
! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
|
||||||
|
log_error "No LLM server at $ENDPOINT"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Auto-detect model name from server if not provided
|
||||||
|
if [[ -z "$MODEL" ]]; then
|
||||||
|
DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
|
||||||
|
import sys, json
|
||||||
|
try:
|
||||||
|
data = json.load(sys.stdin)
|
||||||
|
models = data.get('data', [])
|
||||||
|
if models:
|
||||||
|
print(models[0].get('id', ''))
|
||||||
|
except: pass
|
||||||
|
" 2>/dev/null || true)
|
||||||
|
if [[ -n "$DETECTED_MODEL" ]]; then
|
||||||
|
MODEL="$DETECTED_MODEL"
|
||||||
|
log_info "Auto-detected model: $MODEL"
|
||||||
|
else
|
||||||
|
log_error "Model name required. Use --model NAME"
|
||||||
|
log_info "Examples:"
|
||||||
|
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
|
||||||
|
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
TS="$(timestamp)"
|
TS="$(timestamp)"
|
||||||
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
|
SAFE_MODEL="$(echo "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"
|
||||||
|
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
|
||||||
mkdir -p "$RUN_DIR"
|
mkdir -p "$RUN_DIR"
|
||||||
|
|
||||||
log_header "Agentic Evaluation: $SUITE"
|
log_header "Agentic Evaluation: $SUITE"
|
||||||
@@ -86,7 +112,11 @@ ENDJSON
|
|||||||
METRICS_FILE="$RUN_DIR/metrics.csv"
|
METRICS_FILE="$RUN_DIR/metrics.csv"
|
||||||
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
|
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
|
||||||
METRICS_PID=$!
|
METRICS_PID=$!
|
||||||
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
|
cleanup() {
|
||||||
|
kill "$METRICS_PID" 2>/dev/null || true
|
||||||
|
wait "$METRICS_PID" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
trap 'cleanup; exit 0' EXIT
|
||||||
|
|
||||||
# ── Suite execution ──────────────────────────────────────
|
# ── Suite execution ──────────────────────────────────────
|
||||||
|
|
||||||
@@ -113,14 +143,14 @@ run_evalplus() {
|
|||||||
run_inspect_eval() {
|
run_inspect_eval() {
|
||||||
local eval_name="$1"
|
local eval_name="$1"
|
||||||
local display_name="$2"
|
local display_name="$2"
|
||||||
|
local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval
|
||||||
log_info "Running Inspect AI: $display_name..."
|
log_info "Running Inspect AI: $display_name..."
|
||||||
local out="$RUN_DIR/inspect-${eval_name}.json"
|
|
||||||
|
|
||||||
OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
|
OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
|
||||||
inspect eval "$eval_name" \
|
inspect eval "$eval_name" \
|
||||||
--model "openai/$MODEL" \
|
--model "openai/$MODEL" \
|
||||||
--log-dir "$RUN_DIR/inspect-logs/" \
|
--log-dir "$RUN_DIR/inspect-logs/" \
|
||||||
2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
|
2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"
|
||||||
|
|
||||||
log_success "Inspect $display_name complete"
|
log_success "Inspect $display_name complete"
|
||||||
}
|
}
|
||||||
@@ -138,7 +168,7 @@ run_bigcodebench() {
|
|||||||
case "$SUITE" in
|
case "$SUITE" in
|
||||||
quick)
|
quick)
|
||||||
run_evalplus "humaneval"
|
run_evalplus "humaneval"
|
||||||
run_inspect_eval "ifeval" "IFEval (instruction following)"
|
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
|
||||||
;;
|
;;
|
||||||
code)
|
code)
|
||||||
run_evalplus "humaneval"
|
run_evalplus "humaneval"
|
||||||
@@ -146,13 +176,13 @@ case "$SUITE" in
|
|||||||
run_bigcodebench
|
run_bigcodebench
|
||||||
;;
|
;;
|
||||||
tooluse)
|
tooluse)
|
||||||
run_inspect_eval "bfcl" "BFCL (function calling)"
|
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
|
||||||
;;
|
;;
|
||||||
full)
|
full)
|
||||||
run_evalplus "humaneval"
|
run_evalplus "humaneval"
|
||||||
run_evalplus "mbpp"
|
run_evalplus "mbpp"
|
||||||
run_inspect_eval "ifeval" "IFEval (instruction following)"
|
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
|
||||||
run_inspect_eval "bfcl" "BFCL (function calling)"
|
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
|
||||||
run_bigcodebench
|
run_bigcodebench
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
@@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh"
|
|||||||
log_header "Agentic Evaluation Setup"
|
log_header "Agentic Evaluation Setup"
|
||||||
|
|
||||||
# ── Python virtual environment ───────────────────────────
|
# ── Python virtual environment ───────────────────────────
|
||||||
VENV_DIR="$(data_dir venv)"
|
VENV_DIR="$PROJECT_ROOT/.venv"
|
||||||
|
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"
|
||||||
|
|
||||||
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
||||||
log_info "Creating Python virtual environment..."
|
# Prefer Python 3.13 (bigcodebench requires <3.14)
|
||||||
python3 -m venv "$VENV_DIR"
|
PYTHON_BIN="python3.13"
|
||||||
|
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
|
||||||
|
PYTHON_BIN="python3"
|
||||||
|
log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
|
||||||
|
fi
|
||||||
|
log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
|
||||||
|
"$PYTHON_BIN" -m venv "$VENV_DIR"
|
||||||
log_success "Virtual environment created at $VENV_DIR"
|
log_success "Virtual environment created at $VENV_DIR"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
source "$VENV_DIR/bin/activate"
|
source "$VENV_DIR/bin/activate"
|
||||||
log_info "Python: $(python3 --version) from $VENV_DIR"
|
log_info "Python: $(python3 --version) from $VENV_DIR"
|
||||||
|
|
||||||
# ── Install evaluation frameworks ────────────────────────
|
# ── Install from requirements.txt ────────────────────────
|
||||||
|
log_info "Installing dependencies from requirements.txt..."
|
||||||
# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
|
pip install -r "$REQUIREMENTS" 2>&1 | tail -5
|
||||||
if python3 -c "import inspect_ai" 2>/dev/null; then
|
log_success "Dependencies installed"
|
||||||
log_success "inspect-ai already installed"
|
|
||||||
else
|
|
||||||
log_info "Installing inspect-ai (main eval framework)..."
|
|
||||||
pip install inspect-ai 2>&1 | tail -3
|
|
||||||
log_success "inspect-ai installed"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# EvalPlus — HumanEval+ and MBPP+ with native ollama support
|
|
||||||
if python3 -c "import evalplus" 2>/dev/null; then
|
|
||||||
log_success "evalplus already installed"
|
|
||||||
else
|
|
||||||
log_info "Installing evalplus (code generation benchmarks)..."
|
|
||||||
pip install evalplus 2>&1 | tail -3
|
|
||||||
log_success "evalplus installed"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# BigCodeBench
|
|
||||||
if python3 -c "import bigcodebench" 2>/dev/null; then
|
|
||||||
log_success "bigcodebench already installed"
|
|
||||||
else
|
|
||||||
log_info "Installing bigcodebench..."
|
|
||||||
pip install bigcodebench 2>&1 | tail -3
|
|
||||||
log_success "bigcodebench installed"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ── Check for local LLM server ──────────────────────────
|
# ── Check for local LLM server ──────────────────────────
|
||||||
log_header "LLM Server Check"
|
log_header "LLM Server Check"
|
||||||
|
|
||||||
ollama_ok=false
|
if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
|
||||||
llamacpp_ok=false
|
log_success "llama-server running at localhost:8080"
|
||||||
|
elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then
|
||||||
if is_cmd ollama; then
|
|
||||||
if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
|
|
||||||
log_success "ollama running at localhost:11434"
|
log_success "ollama running at localhost:11434"
|
||||||
ollama_ok=true
|
|
||||||
# List available models
|
|
||||||
log_info "Available ollama models:"
|
|
||||||
ollama list 2>/dev/null | head -10 || true
|
|
||||||
else
|
else
|
||||||
log_warn "ollama installed but not running. Start with: ollama serve"
|
log_warn "No local LLM server running. Start one before running evals:"
|
||||||
fi
|
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
|
||||||
else
|
log_info " ollama serve (ollama)"
|
||||||
log_info "ollama not installed — needed for most agentic benchmarks"
|
|
||||||
log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check for llama.cpp server
|
|
||||||
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
|
|
||||||
log_success "llama.cpp server running at localhost:8080"
|
|
||||||
llamacpp_ok=true
|
|
||||||
else
|
|
||||||
log_info "No llama.cpp server detected at localhost:8080"
|
|
||||||
log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! $ollama_ok && ! $llamacpp_ok; then
|
|
||||||
log_warn "No local LLM server running. Agentic benchmarks need one."
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Summary ──────────────────────────────────────────────
|
# ── Summary ──────────────────────────────────────────────
|
||||||
log_header "Setup Complete"
|
log_header "Setup Complete"
|
||||||
echo ""
|
echo ""
|
||||||
echo " Installed tools:"
|
echo " Installed tools:"
|
||||||
echo " inspect-ai — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
|
echo " inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)"
|
||||||
|
echo " inspect-evals — Task definitions for inspect-ai"
|
||||||
echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
|
echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
|
||||||
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
|
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
|
||||||
echo ""
|
echo ""
|
||||||
echo " To activate the virtual environment:"
|
echo " Activate venv: source .venv/bin/activate"
|
||||||
echo " source data/venv/bin/activate"
|
|
||||||
echo ""
|
echo ""
|
||||||
echo " Run evaluations:"
|
echo " Run evaluations:"
|
||||||
echo " make agentic-quick # EvalPlus + IFEval (~1 hour)"
|
echo " make agentic-quick # EvalPlus HumanEval+ + IFEval (~1 hour)"
|
||||||
echo " make agentic-full # BFCL + BigCodeBench (~3-4 hours)"
|
echo " make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)"
|
||||||
|
echo " make agentic-tooluse # BFCL function calling (~1-2 hours)"
|
||||||
|
echo " make agentic-full # All of the above (~5-6 hours)"
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
@@ -23,12 +23,14 @@ PP_TOKENS=512
|
|||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
||||||
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
BACKENDS_FILTER="llama-vulkan-radv" # Default to Vulkan; use --backends to override
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--skip-longctx) SKIP_LONGCTX=true; shift ;;
|
--skip-longctx) SKIP_LONGCTX=true; shift ;;
|
||||||
--max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
|
--max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
|
||||||
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
|
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
|
||||||
|
--backends) BACKENDS_FILTER="$2"; shift 2 ;;
|
||||||
--reps|-r) REPS_STANDARD="$2"; shift 2 ;;
|
--reps|-r) REPS_STANDARD="$2"; shift 2 ;;
|
||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
@@ -42,6 +44,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --skip-longctx Skip long-context tests"
|
echo " --skip-longctx Skip long-context tests"
|
||||||
echo " --max-size GB Only bench models up to this file size in GB"
|
echo " --max-size GB Only bench models up to this file size in GB"
|
||||||
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
|
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
|
||||||
|
echo " --backends LIST Comma-separated backends (default: llama-vulkan-radv)"
|
||||||
echo " --reps N Standard test repetitions (default: 5)"
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
@@ -94,15 +97,18 @@ declare -A BENCH_PATHS=(
|
|||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
||||||
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
||||||
|
[llama-rocm-7.2.1]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
||||||
)
|
)
|
||||||
|
|
||||||
available_backends=()
|
available_backends=()
|
||||||
for tb in "${!BENCH_PATHS[@]}"; do
|
for tb in "${!BENCH_PATHS[@]}"; do
|
||||||
if echo "$existing" | grep -q "^${tb}$"; then
|
if echo "$existing" | grep -q "^${tb}$"; then
|
||||||
|
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -qFx "$tb"; then
|
||||||
available_backends+=("$tb")
|
available_backends+=("$tb")
|
||||||
log_success "Backend: $tb"
|
log_success "Backend: $tb"
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
if (( ${#available_backends[@]} == 0 )); then
|
if (( ${#available_backends[@]} == 0 )); then
|
||||||
@@ -269,8 +275,8 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
|
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
-p "$CTX_PROMPT" -n "$TG_TOKENS" -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
||||||
-r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
-r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD_LC[*]}"
|
printf " cmd: %s\n" "${CMD_LC[*]}"
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ source "$SCRIPT_DIR/../../lib/format.sh"
|
|||||||
|
|
||||||
MODEL_DIR="$(data_dir models)"
|
MODEL_DIR="$(data_dir models)"
|
||||||
TAG="run"
|
TAG="run"
|
||||||
BACKENDS_FILTER=""
|
BACKENDS_FILTER="llama-vulkan-radv"
|
||||||
MODELS_FILTER=""
|
MODELS_FILTER=""
|
||||||
SKIP_LONGCTX=false
|
SKIP_LONGCTX=false
|
||||||
MAX_SIZE_GB=0
|
MAX_SIZE_GB=0
|
||||||
@@ -99,13 +99,14 @@ declare -A BENCH_PATHS=(
|
|||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
||||||
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
||||||
|
[llama-rocm-7.2.1]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
||||||
)
|
)
|
||||||
|
|
||||||
available_backends=()
|
available_backends=()
|
||||||
for tb in "${!BENCH_PATHS[@]}"; do
|
for tb in "${!BENCH_PATHS[@]}"; do
|
||||||
if echo "$existing" | grep -q "^${tb}$"; then
|
if echo "$existing" | grep -q "^${tb}$"; then
|
||||||
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -q "$tb"; then
|
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -qFx "$tb"; then
|
||||||
available_backends+=("$tb")
|
available_backends+=("$tb")
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@@ -130,7 +131,7 @@ for p in "${ALL_MODEL_PATHS[@]}"; do
|
|||||||
|
|
||||||
# Name filter
|
# Name filter
|
||||||
if [[ -n "$MODELS_FILTER" ]]; then
|
if [[ -n "$MODELS_FILTER" ]]; then
|
||||||
if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$local_name"; then
|
if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qiF "$local_name"; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@@ -252,7 +253,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
-p "$CTX_PROMPT" -n "$TG_TOKENS" -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
log_success "Done"; tail -3 "$OUT_LC"
|
log_success "Done"; tail -3 "$OUT_LC"
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ log_header "Benchmark Setup"
|
|||||||
# ── 1. Check toolbox containers ──────────────────────────
|
# ── 1. Check toolbox containers ──────────────────────────
|
||||||
log_info "Checking toolbox containers..."
|
log_info "Checking toolbox containers..."
|
||||||
|
|
||||||
REQUIRED_TOOLBOXES=("llama-vulkan-radv" "llama-rocm-7.2")
|
REQUIRED_TOOLBOXES=("llama-vulkan-radv" "llama-rocm-7.2.1")
|
||||||
OPTIONAL_TOOLBOXES=("llama-rocm-6.4.4" "llama-vulkan-amdvlk")
|
OPTIONAL_TOOLBOXES=("llama-rocm-7.2" "llama-rocm-6.4.4" "llama-vulkan-amdvlk")
|
||||||
|
|
||||||
existing=$(detect_toolbox_names 2>/dev/null || true)
|
existing=$(detect_toolbox_names 2>/dev/null || true)
|
||||||
missing=()
|
missing=()
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ CTX_SIZE=131072
|
|||||||
PARALLEL=1
|
PARALLEL=1
|
||||||
MODEL=""
|
MODEL=""
|
||||||
NGRAM=false
|
NGRAM=false
|
||||||
|
NO_THINK=false
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
||||||
--parallel) PARALLEL="$2"; shift 2 ;;
|
--parallel) PARALLEL="$2"; shift 2 ;;
|
||||||
--ngram) NGRAM=true; shift ;;
|
--ngram) NGRAM=true; shift ;;
|
||||||
|
--no-think) NO_THINK=true; shift ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: launch.sh [OPTIONS]"
|
echo "Usage: launch.sh [OPTIONS]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --ctx N Context size (default: 131072)"
|
echo " --ctx N Context size (default: 131072)"
|
||||||
echo " --parallel N Parallel request slots (default: 1)"
|
echo " --parallel N Parallel request slots (default: 1)"
|
||||||
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
||||||
|
echo " --no-think Disable thinking/reasoning (faster for evals)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Presets (pass model filename):"
|
echo "Presets (pass model filename):"
|
||||||
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
||||||
@@ -76,6 +79,7 @@ declare -A SERVER_PATHS=(
|
|||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-server"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-server"
|
||||||
[llama-rocm-6.4.4]="/usr/local/bin/llama-server"
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-server"
|
||||||
[llama-rocm-7.2]="/usr/local/bin/llama-server"
|
[llama-rocm-7.2]="/usr/local/bin/llama-server"
|
||||||
|
[llama-rocm-7.2.1]="/usr/local/bin/llama-server"
|
||||||
[llama-rocm7-nightlies]="/usr/local/bin/llama-server"
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-server"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -101,7 +105,7 @@ fi
|
|||||||
SERVER_ARGS=(
|
SERVER_ARGS=(
|
||||||
-ngl 99 # Full GPU offload
|
-ngl 99 # Full GPU offload
|
||||||
--no-mmap # Direct load, no mmap overhead
|
--no-mmap # Direct load, no mmap overhead
|
||||||
-fa # Flash attention
|
-fa on # Flash attention
|
||||||
-m "$TOOLBOX_MODEL_PATH"
|
-m "$TOOLBOX_MODEL_PATH"
|
||||||
-c "$CTX_SIZE" # Context size
|
-c "$CTX_SIZE" # Context size
|
||||||
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
||||||
@@ -110,6 +114,11 @@ SERVER_ARGS=(
|
|||||||
-np "$PARALLEL" # Parallel slots
|
-np "$PARALLEL" # Parallel slots
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Disable thinking mode (faster for evals)
|
||||||
|
if $NO_THINK; then
|
||||||
|
SERVER_ARGS+=(--reasoning-budget 0)
|
||||||
|
fi
|
||||||
|
|
||||||
# N-gram speculative decoding
|
# N-gram speculative decoding
|
||||||
if $NGRAM; then
|
if $NGRAM; then
|
||||||
SERVER_ARGS+=(
|
SERVER_ARGS+=(
|
||||||
@@ -126,6 +135,7 @@ log_info "Backend: $BACKEND"
|
|||||||
log_info "Context: $CTX_SIZE tokens"
|
log_info "Context: $CTX_SIZE tokens"
|
||||||
log_info "KV cache: q4_0/q4_0"
|
log_info "KV cache: q4_0/q4_0"
|
||||||
log_info "Parallel slots: $PARALLEL"
|
log_info "Parallel slots: $PARALLEL"
|
||||||
|
$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
|
||||||
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
||||||
log_info "Port: $PORT"
|
log_info "Port: $PORT"
|
||||||
log_info "Endpoint: http://localhost:$PORT"
|
log_info "Endpoint: http://localhost:$PORT"
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--category"
|
assert_output --partial "--category"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--kv-types"
|
assert_output --partial "--kv-types"
|
||||||
|
assert_output --partial "--batch"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "run-suite --help shows usage and exits 0" {
|
@test "run-suite --help shows usage and exits 0" {
|
||||||
@@ -22,6 +23,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--tag"
|
assert_output --partial "--tag"
|
||||||
assert_output --partial "--kv-types"
|
assert_output --partial "--kv-types"
|
||||||
|
assert_output --partial "--batch"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher shows help with no args" {
|
@test "benchmark dispatcher shows help with no args" {
|
||||||
@@ -31,6 +33,18 @@ load test_helper.sh
|
|||||||
assert_output --partial "--max-size"
|
assert_output --partial "--max-size"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--kv-types"
|
assert_output --partial "--kv-types"
|
||||||
|
assert_output --partial "--batch"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "serve --help shows usage and exits 0" {
|
||||||
|
run bash "$PROJECT_ROOT/bin/serve" --help
|
||||||
|
assert_success
|
||||||
|
assert_output --partial "Usage"
|
||||||
|
assert_output --partial "--model"
|
||||||
|
assert_output --partial "--ngram"
|
||||||
|
assert_output --partial "--no-think"
|
||||||
|
assert_output --partial "--ctx"
|
||||||
|
assert_output --partial "--port"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher passes --help through to baseline" {
|
@test "benchmark dispatcher passes --help through to baseline" {
|
||||||
|
|||||||
Reference in New Issue
Block a user