diff --git a/CLAUDE.md b/CLAUDE.md index e169353..bb671cf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,8 +41,14 @@ make verify # 9-point optimization checklist bin/audit --json | python3 -m json.tool # Verify JSON output is valid ``` +## Agentic Evaluation + +Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `data/venv/`. Eval frameworks: inspect-ai (all-in-one), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint (ollama or llama.cpp server). Model catalog at `configs/models.conf`. + ## External Resources All external links are centralized in [docs/references.md](docs/references.md). Key ones: - AMD ROCm Strix Halo guide (kernel params, GTT configuration) - Donato Capitella toolboxes (container images, benchmarks, VRAM estimator) +- Qwen3.5 model family (GGUF quants by Unsloth) +- Agentic eval frameworks (Inspect AI, EvalPlus, BFCL, BigCodeBench) diff --git a/Makefile b/Makefile index a38c47c..04b96d8 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help audit audit-full monitor monitor-simple benchmark benchmark-baseline benchmark-compare optimize verify test +.PHONY: help audit audit-full monitor monitor-simple benchmark benchmark-baseline benchmark-compare optimize verify test agentic-setup agentic-quick agentic-code agentic-tooluse agentic-full help: ## Show available commands @echo "Strix Halo Optimization Toolkit" @@ -57,6 +57,22 @@ verify: ## Post-optimization verification checklist rollback: ## Rollback optimizations @bash scripts/optimize/rollback.sh +# --- Agentic Evaluation --- +agentic-setup: ## Install agentic evaluation frameworks (inspect-ai, evalplus) + @bash bin/agentic setup + +agentic-quick: ## EvalPlus + IFEval quick eval (needs --model, ~1h) + @bash bin/agentic quick $(ARGS) + +agentic-code: ## Code generation eval: EvalPlus + BigCodeBench (~2-3h) + @bash bin/agentic code $(ARGS) + +agentic-tooluse: ## Tool/function calling eval: BFCL (~1-2h) + @bash bin/agentic tooluse $(ARGS) + 
+agentic-full: ## All agentic evaluations (~5-6h) + @bash bin/agentic full $(ARGS) + # --- Tests --- test: ## Run BATS test suite @bats tests/ diff --git a/README.md b/README.md index 01b5d2b..ed53b4f 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,12 @@ Each `[!!]` is an optimization opportunity. Run `make optimize` to address them. | `make optimize-vram` | BIOS VRAM guidance + GTT verification | | `make verify` | Post-optimization verification checklist | | `sudo make rollback` | Rollback optimizations | +| `make agentic-setup` | Install agentic eval frameworks (inspect-ai, evalplus) | +| `make agentic-quick ARGS="--model NAME"` | EvalPlus + IFEval (~1 hour) | +| `make agentic-code ARGS="--model NAME"` | Code generation evals (~2-3 hours) | +| `make agentic-tooluse ARGS="--model NAME"` | BFCL function calling eval (~1-2 hours) | +| `make agentic-full ARGS="--model NAME"` | All agentic evaluations (~5-6 hours) | +| `make test` | Run BATS test suite | ## Optimization Workflow @@ -107,6 +113,8 @@ See [docs/architecture.md](docs/architecture.md) for the full architecture, data | [docs/benchmarking.md](docs/benchmarking.md) | Benchmark methodology, test params, result interpretation | | [docs/bios-vram-guide.md](docs/bios-vram-guide.md) | HP ZBook BIOS configuration for VRAM | | [docs/troubleshooting.md](docs/troubleshooting.md) | Common issues and fixes | +| [docs/model-recommendations.md](docs/model-recommendations.md) | Qwen3.5 models, quantization, memory planning | +| [docs/agentic-benchmarks.md](docs/agentic-benchmarks.md) | Agentic evaluation frameworks and methodology | | [docs/references.md](docs/references.md) | External links: AMD docs, toolboxes, community resources | ## Contributing diff --git a/bin/agentic b/bin/agentic new file mode 100755 index 0000000..2718013 --- /dev/null +++ b/bin/agentic @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Agentic evaluation dispatcher +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" + +case "${1:-help}" in + setup) exec bash "$SCRIPT_DIR/scripts/agentic/setup.sh" ;; + run) exec bash "$SCRIPT_DIR/scripts/agentic/run-eval.sh" "${@:2}" ;; + quick) exec bash "$SCRIPT_DIR/scripts/agentic/run-eval.sh" --suite quick "${@:2}" ;; + code) exec bash "$SCRIPT_DIR/scripts/agentic/run-eval.sh" --suite code "${@:2}" ;; + tooluse) exec bash "$SCRIPT_DIR/scripts/agentic/run-eval.sh" --suite tooluse "${@:2}" ;; + full) exec bash "$SCRIPT_DIR/scripts/agentic/run-eval.sh" --suite full "${@:2}" ;; + *) + echo "Usage: agentic [options]" + echo "" + echo "Commands:" + echo " setup Install evaluation frameworks (inspect-ai, evalplus, bigcodebench)" + echo " quick EvalPlus HumanEval+ + IFEval (~1 hour)" + echo " code EvalPlus + BigCodeBench (~2-3 hours)" + echo " tooluse BFCL function calling evaluation (~1-2 hours)" + echo " full All evaluations (~5-6 hours)" + echo " run Custom run (--suite SUITE --model NAME --endpoint URL)" + echo "" + echo "All commands require --model NAME. Examples:" + echo " agentic quick --model qwen3.5:35b-a3b-q8_0" + echo " agentic full --model qwen3-coder:30b-a3b --endpoint http://localhost:8080/v1" + exit 1 + ;; +esac diff --git a/configs/models.conf b/configs/models.conf new file mode 100644 index 0000000..3d6696f --- /dev/null +++ b/configs/models.conf @@ -0,0 +1,22 @@ +# Model catalog for benchmarking +# Format: NAME|HF_REPO|FILE|SIZE_GB|CATEGORY|DESCRIPTION +# +# Categories: smoke, standard, moe, dense, coding, agentic +# Download with: huggingface-cli download REPO FILE --local-dir data/models + +# ── Smoke tests (quick, small) ─────────────────────────── +qwen3-4b|unsloth/Qwen3-4B-GGUF|Qwen3-4B-Q4_K_M.gguf|3|smoke|Quick validation + +# ── Standard benchmarks ────────────────────────────────── +qwen3-14b|unsloth/Qwen3-14B-GGUF|Qwen3-14B-Q4_K_M.gguf|9|standard|Standard test model + +# ── Qwen3.5 MoE models (fast generation, best for 64GB) ─ +qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|Top 
pick: near-full precision, 3B active +qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|22|moe|Best quality/size ratio, 3B active + +# ── Qwen3.5 dense models ──────────────────────────────── +qwen3.5-27b-q4|unsloth/Qwen3.5-27B-GGUF|Qwen3.5-27B-Q4_K_M.gguf|17|dense|Dense 27B, quality-first +qwen3.5-27b-q8|unsloth/Qwen3.5-27B-GGUF|Qwen3.5-27B-Q8_0.gguf|29|dense|Dense 27B, max quality + +# ── Coding / agentic models ───────────────────────────── +qwen3-coder-30b-a3b|unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF|Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf|18|agentic|Best for tool use + coding, 3B active diff --git a/docs/agentic-benchmarks.md b/docs/agentic-benchmarks.md new file mode 100644 index 0000000..d6914da --- /dev/null +++ b/docs/agentic-benchmarks.md @@ -0,0 +1,444 @@ +# Local Agentic Flow Benchmarks for Strix Halo + +Research summary: benchmarking agentic LLM capabilities on consumer hardware (AMD Strix Halo, Ryzen AI MAX+ 395, 64 GB unified memory) using llama.cpp, Ollama, and LM Studio. + +--- + +## Scope + +This document covers locally-runnable agentic benchmarks, evaluation frameworks, practical measurement approaches, and model recommendations (with emphasis on the Qwen family) for the Strix Halo platform. Cloud-only benchmarks that cannot accept a local OpenAI-compatible endpoint are out of scope. + +--- + +## 1. Agentic Benchmarks Runnable Locally + +### 1.1 Berkeley Function Calling Leaderboard (BFCL) + +**What it measures**: Function/tool calling accuracy across serial calls, parallel calls, multiple languages, and multi-turn agentic interactions. + +**Why it matters**: BFCL is the de facto standard for evaluating function-calling quality. Version 4 (2025) added holistic agentic evaluation with stateful multi-step reasoning. 
+ +**Local setup**: +```bash +# Option A: pip package +pip install bfcl-eval + +# Option B: from source (more control) +git clone https://github.com/ShishirPatil/gorilla.git +cd gorilla/berkeley-function-call-leaderboard +pip install -e . +``` + +Evaluate a local model by pointing BFCL at any OpenAI-compatible endpoint (ollama, llama.cpp server, vLLM). The framework uses AST-based evaluation to verify function call correctness without executing them. + +- **Repository**: https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard +- **Leaderboard**: https://gorilla.cs.berkeley.edu/leaderboard.html +- **Paper**: Patil et al., "The Berkeley Function Calling Leaderboard (BFCL): From Tool Use to Agentic Evaluation of Large Language Models," ICML 2025. + +### 1.2 SWE-bench / SWE-bench Verified + +**What it measures**: Ability to resolve real GitHub issues by generating patches against actual repositories. + +**Why it matters**: The gold standard for evaluating coding agents. Tasks require understanding large codebases, multi-file edits, and test-driven validation. + +**Local setup**: Evaluation runs inside Docker containers with network isolation. Two primary agent scaffolds support local models: + +- **SWE-agent** (https://swe-agent.com): Install via pip, configure `config.toml` to point at a local OpenAI-compatible endpoint. There is also a dedicated open-weight model, SWE-bench/SWE-agent-LM-32B. +- **OpenHands** (https://github.com/OpenHands/OpenHands): `pip install openhands`, then `openhands serve`. Configure `config.toml` with your local model's `base_url`. + +**Hardware note**: SWE-bench evaluation requires an x86_64 machine with at least 120 GB free storage, 16 GB RAM, and 8 CPU cores for the Docker harness (separate from model inference). Models smaller than 32B parameters show significantly degraded instruction following on these tasks. 
+ +- **Repository**: https://github.com/SWE-bench/SWE-bench +- **Paper**: Jimenez et al., "SWE-bench: Can Language Models Resolve Real-world Github Issues?" ICLR 2024. + +### 1.3 AgentBench + +**What it measures**: LLM-as-agent across 8 environments: OS interaction, database queries, knowledge graphs, card games, lateral thinking, house-holding, web shopping, and web browsing. + +**Why it matters**: The broadest multi-environment agent evaluation. Tests planning, reasoning, tool use, and decision-making in multi-turn open-ended settings. + +**Local setup**: The evaluation package is released at https://github.com/THUDM/AgentBench. It supports custom model endpoints. Open-source models up to 70B show a significant performance gap versus frontier commercial models, making it a useful diagnostic for understanding where local models fall short. + +- **Paper**: Liu et al., "AgentBench: Evaluating LLMs as Agents," ICLR 2024. + +### 1.4 GAIA (General AI Assistants) + +**What it measures**: Multi-step tasks requiring web search, document reading, calculation, and synthesis. 466 tasks that are trivially easy for humans (92% accuracy) but extremely challenging for AI. + +**Local setup**: Available on Hugging Face. Requires a model with tool-use capabilities (web search, file reading, calculator). Can be wired to a local model via smolagents or LangChain with local tool implementations. + +- **Paper**: Mialon et al., "GAIA: A Benchmark for General AI Assistants," ICLR 2024. + +### 1.5 DeepPlanning (Qwen) + +**What it measures**: Long-horizon agentic planning with verifiable constraints. Two domains: multi-day travel planning (9 APIs for flights, trains, hotels, restaurants, attractions) and multi-product shopping. + +**Why it matters**: Evaluates three critical agentic abilities: +1. Proactive information acquisition (actively calling APIs to discover hidden states) +2. Local constrained reasoning (step-level logic like brand matching) +3. 
Global constrained optimization (budget caps, multi-day time feasibility) + +**Local setup**: Open-sourced January 2026. Dataset at https://huggingface.co/datasets/Qwen/DeepPlanning. Evaluation code integrated into the Qwen-Agent framework. + +- **Paper**: "DeepPlanning: Benchmarking Long-Horizon Agentic Planning with Verifiable Constraints," arXiv:2601.18137, January 2026. + +### 1.6 Code Generation: EvalPlus (HumanEval+ / MBPP+) + +**What it measures**: Functional correctness of generated code. EvalPlus extends HumanEval by 80x and MBPP by 35x test cases. + +**Local setup** (direct Ollama support): +```bash +pip install evalplus + +# Run against a local Ollama model +evalplus.evaluate \ + --model "qwen3-coder:30b" \ + --dataset humaneval \ + --backend ollama \ + --base-url http://localhost:11434/v1 \ + --greedy +``` + +- **Repository**: https://github.com/evalplus/evalplus +- **Leaderboard**: https://evalplus.github.io/leaderboard.html + +### 1.7 BigCodeBench + +**What it measures**: 1,140 function-level tasks requiring composition of multiple function calls across 139 libraries. Average 5.6 test cases per task with 99% branch coverage. + +**Local setup**: Based on EvalPlus infrastructure; supports the same backends including Ollama and vLLM. + +- **Repository**: https://github.com/bigcode-project/bigcodebench +- **Paper**: "BigCodeBench: Benchmarking Code Generation Towards AGI," ICLR 2025. + +### 1.8 IFEval (Instruction Following Evaluation) + +**What it measures**: Compliance with programmatically verifiable instructions ("write more than 400 words," "mention AI at least 3 times"). No subjective judgment needed. + +**Local setup**: Available through lm-evaluation-harness and Inspect AI. Recent variants include IFEval-FC (function calling format compliance) and M-IFEval (multilingual). + +- **Paper**: Zhou et al., "Instruction-Following Evaluation for Large Language Models," arXiv:2311.07911, 2023. + +--- + +## 2. 
Local Agentic Evaluation Frameworks + +### 2.1 Inspect AI (UK AISI) + +The most comprehensive single framework for local agentic evaluation. + +**Key features**: +- 100+ pre-built evaluations including BFCL, GAIA, HumanEval, MBPP, IFEval, GSM8K +- Native support for tool calling: custom tools, MCP tools, built-in bash/python/web tools +- Web-based Inspect View for monitoring and visualizing evaluations +- VS Code extension for development +- Works with any OpenAI-compatible endpoint (ollama, llama.cpp, vLLM) + +```bash +pip install inspect-ai + +# Run BFCL evaluation against a local model +inspect eval inspect_evals/bfcl --model openai/local-model \ + --model-base-url http://localhost:11434/v1 +``` + +- **Repository**: https://github.com/UKGovernmentBEIS/inspect_ai +- **Evals collection**: https://github.com/UKGovernmentBEIS/inspect_evals +- **Documentation**: https://inspect.aisi.org.uk/ + +### 2.2 EleutherAI lm-evaluation-harness + +The standard academic framework. 60+ benchmarks including MMLU, HellaSwag, ARC, GSM8K, HumanEval. Serves as the backend for Hugging Face's Open LLM Leaderboard. + +**Local model support**: Works with HuggingFace models directly, OpenAI-compatible APIs, and custom backends. The `local-completions` and `local-chat-completions` model types support any local server. 
+ +```bash +pip install lm-eval + +lm_eval --model local-chat-completions \ + --model_args model=qwen3-coder:30b,base_url=http://localhost:11434/v1 \ + --tasks humaneval,mbpp,ifeval \ + --batch_size auto +``` + +- **Repository**: https://github.com/EleutherAI/lm-evaluation-harness + +### 2.3 smolagents (Hugging Face) + +Lightweight agentic framework with two core agent types: +- **CodeAgent**: Generates and executes sandboxed Python code +- **ToolCallingAgent**: Calls external APIs and custom functions + +**Ollama integration** is first-class: +```python +from smolagents import CodeAgent, OllamaModel + +model = OllamaModel(model_id="qwen3-coder:30b") +agent = CodeAgent(tools=[], model=model) +agent.run("What is the 10th Fibonacci number?") +``` + +Supports custom tool definitions and evaluation harnesses. Model-agnostic design means any Ollama, llama.cpp, or LM Studio model works. + +- **Repository**: https://github.com/huggingface/smolagents + +### 2.4 Qwen-Agent + +Purpose-built for Qwen models with optimized tool-calling templates and parsers. + +**Key features**: +- Native MCP (Model Context Protocol) support +- Parallel, multi-step, and multi-turn function calls with automatic parsing +- Code interpreter, RAG, and Chrome extension built in +- DeepPlanning benchmark evaluation integrated + +```bash +pip install qwen-agent[mcp] +``` + +Configure tools via MCP configuration files. The framework handles tool-calling format differences between Qwen model versions automatically. + +- **Repository**: https://github.com/QwenLM/Qwen-Agent +- **Documentation**: https://qwenlm.github.io/Qwen-Agent/ + +### 2.5 LangGraph / CrewAI + +Both support local OpenAI-compatible endpoints. Comparative benchmarks (2026) show: + +- **LangGraph**: Lowest latency and token usage due to graph-based architecture that reduces redundant context passing. Preferred for production with deterministic control flow. Reached v1.0 GA in October 2025. 
+- **CrewAI**: ~40% faster from idea to working prototype. Higher token spend but simpler multi-agent orchestration. v1.10.1 with native MCP and A2A support. 44,600+ GitHub stars. + +Neither provides a built-in standardized benchmark harness, but both can be instrumented to measure task completion rates, tool-call accuracy, and latency. + +### 2.6 Throughput & Performance Benchmarking Tools + +| Tool | Focus | Backends | +|------|-------|----------| +| [ollama-benchmark](https://github.com/aidatatools/ollama-benchmark) | Tokens/s throughput via Ollama | Ollama | +| [llama-benchy](https://github.com/eugr/llama-benchy) | Multi-backend benchmarking (llama-bench style) | vLLM, SGLang, llama.cpp, etc. | +| [benchllama](https://github.com/srikanth235/benchllama) | Local LLM benchmarking | Ollama | +| [local-llm-bench](https://github.com/famstack-dev/local-llm-bench) | Engine comparison (MLX vs llama.cpp) | MLX, llama.cpp | +| llama-bench (built-in) | Raw inference performance | llama.cpp native | + +--- + +## 3. 
Practical Measurement Approaches + +### 3.1 Token Throughput in Multi-Turn Conversations + +Key metrics for agentic workloads on Strix Halo: + +| Metric | Definition | Target | +|--------|-----------|--------| +| Time to First Token (TTFT) | Delay before first token appears | <500ms for interactive use | +| Generation speed (tok/s) | Steady-state token output rate | >30 tok/s for usable agents | +| Prompt processing (tok/s) | Speed of ingesting context | Critical for large codebases | +| KV cache utilization | Memory consumed by conversation history | Scales with context length | + +**Strix Halo 64GB measured performance** (from community benchmarks): + +| Model | Quant | Gen tok/s | Prompt tok/s | VRAM Used | +|-------|-------|-----------|-------------|-----------| +| Qwen3-Coder-30B-A3B | Q4_K_M | ~52-71 | 5-47 (varies by context) | ~18 GB | +| Qwen3-30B-A3B (general) | Q4_K_M | ~52 | -- | ~18 GB | +| 70B dense models | Q4_K_M | ~5 | -- | ~40 GB | + +MoE models like Qwen3-30B-A3B are where 64GB unified memory shines -- only 3B parameters are active per token, so generation is fast despite the 30B total parameter count. + +### 3.2 Tool-Calling Accuracy Measurement + +A practical local test sequence: + +1. **BFCL subset**: Run the BFCL simple function calling tests first (serial single-function calls). If accuracy is below 80%, the model is not suitable for agentic use. +2. **Parallel function calling**: Test with BFCL parallel calling scenarios. Many smaller models fail here. +3. **Multi-turn stateful**: BFCL v3/v4 multi-turn tests or DeepPlanning scenarios. +4. **Format compliance**: IFEval-FC tests whether the model can produce correctly formatted JSON function calls consistently. + +### 3.3 Code Generation Benchmarks + +Recommended evaluation progression (increasing difficulty): + +1. **HumanEval+** via EvalPlus (164 problems, well-understood baseline) +2. **MBPP+** via EvalPlus (974 problems, broader coverage) +3. 
**HumanEval Pro / MBPP Pro** (self-invoking code generation, tests compositionality) +4. **BigCodeBench** (1,140 tasks across 139 libraries, tests real-world API usage) +5. **SWE-bench Verified** (full repository-level coding, requires agent scaffold) + +### 3.4 Composite Agentic Evaluation + +For a holistic view, run these in order: + +``` +Phase 1 - Baseline Quality: + EvalPlus HumanEval+ (code generation) + IFEval (instruction following) + BFCL simple (tool calling basics) + +Phase 2 - Agentic Capability: + BFCL v4 multi-turn (stateful tool use) + DeepPlanning (long-horizon planning) + BigCodeBench (multi-library code composition) + +Phase 3 - Full Agent Evaluation: + AgentBench (multi-environment) + SWE-bench Verified (real-world coding) +``` + +### 3.5 Measuring What Matters for Agents + +Beyond accuracy, measure: +- **Recovery from errors**: Does the model self-correct when a tool call returns an error? +- **Instruction adherence under pressure**: Does tool-calling format degrade as context grows? +- **Planning depth**: How many sequential tool calls can the model chain before losing coherence? +- **Token efficiency**: Total tokens consumed per successful task completion. + +--- + +## 4. 
Best Models for Agentic Use (Qwen Family Focus) + +### 4.1 Recommended for Strix Halo 64GB + +#### Tier 1: Primary Recommendation + +**Qwen3-Coder-30B-A3B-Instruct** (MoE: 30.5B total, 3.3B active) +- 128 experts, 8 activated per token +- 262K native context length +- Specially designed function-call format +- ~52-71 tok/s on Strix Halo (Q4_K_M, ~18 GB VRAM) +- Supports: Ollama, LM Studio, llama.cpp, KTransformers +- Available via: `ollama pull renchris/qwen3-coder:30b-gguf-unsloth` +- GGUF: https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF + +#### Tier 1 (Alternative): General-Purpose Agent + +**Qwen3.5-35B-A3B** (MoE: 35B total, 3B active) +- Hybrid architecture: Gated Delta Networks + sparse MoE +- 256K context, 201 languages +- BFCL-V4 scores competitive with much larger models +- Recommended settings: temperature=0.7, top_p=0.8, top_k=20, repetition_penalty=1.05 + +#### Tier 2: Smaller / Faster + +**Qwen3.5-9B** (Dense: 9B parameters) +- Matches GPT-OSS-120B (a model 13x its size) on GPQA Diamond (81.7 vs 71.5) +- Fits easily in 64GB with very long context +- Good for rapid prototyping and testing agent architectures +- Available via: `ollama pull qwen3.5:9b` + +#### Tier 3: Maximum Capability (fits in 64GB with quantization) + +**Qwen3-Coder-Next** (MoE: 80B total, 3B active) +- SWE-bench Verified: 70.6% (SWE-Agent scaffold) +- SWE-bench Pro: 44.3% (beats DeepSeek-V3.2 at 40.9) +- Requires >45GB for 4-bit quants; >30GB for 2-bit XL quants +- Fits on 64GB Strix Halo with Q4_K quantization (tight but feasible) +- GGUF: https://huggingface.co/unsloth/Qwen3-Coder-Next-GGUF +- Run via: `llama-server -hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q4_K_XL` + +### 4.2 Qwen Family Comparison for Agentic Tasks + +| Model | Type | Active Params | BFCL-V4 | SWE-bench | Best For | 64GB Feasible | +|-------|------|--------------|---------|-----------|----------|---------------| +| Qwen3-Coder-30B-A3B | MoE | 3.3B | Strong | Moderate | Tool calling, coding agents | 
Yes, comfortably | +| Qwen3.5-35B-A3B | MoE | 3B | Strong | -- | General agentic tasks | Yes, comfortably | +| Qwen3.5-9B | Dense | 9B | Good | -- | Fast prototyping, testing | Yes, easily | +| Qwen3-Coder-Next | MoE | 3B | Strong | 70.6% | Maximum coding capability | Yes, tight (Q4) | +| Qwen3.5-122B-A10B | MoE | 10B | 72.2 | -- | Best tool calling | Marginal (needs Q2-Q3) | +| Qwen3-Coder-480B-A35B | MoE | 35B | SOTA | SOTA open | Maximum performance | No (too large) | + +### 4.3 Non-Qwen Alternatives Worth Testing + +| Model | Parameters | Notable For | +|-------|-----------|-------------| +| GLM-4.7-Flash | 30B MoE (3B active) | Strong agentic performance, 128K context | +| DeepSeek-V3.2 | MoE | Competitive coding agent | +| Phi-4-Mini | 14B dense | Native function calling, small footprint | +| SWE-agent-LM-32B | 32B dense | Purpose-built for SWE-bench | + +### 4.4 Optimal Setup for Agentic Use on Strix Halo + +```bash +# 1. Start model server (llama.cpp for best AMD GPU utilization) +llama-server \ + -hf unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M \ + -ngl 99 \ + --ctx-size 32768 \ + --port 8080 + +# 2. Use Qwen-Agent for tool calling (optimized templates) +pip install qwen-agent[mcp] + +# 3. Or use smolagents for framework-agnostic evaluation +pip install smolagents +``` + +For the Qwen models specifically, Qwen-Agent is recommended because it encapsulates the correct tool-calling templates and parsers internally, avoiding format mismatches that degrade function calling accuracy. + +--- + +## 5. Open Questions / Limitations + +1. **Quantization impact on tool calling**: Most benchmark results are reported at full precision (BF16/FP16). Quantization to Q4_K_M or lower may disproportionately affect structured output quality (JSON formatting, argument types) versus general text generation. No systematic study exists for this on Strix Halo specifically. + +2. **Context length vs. accuracy tradeoff**: Agentic workflows accumulate long conversation histories. 
MoE models with 262K context windows are advertised but tool-calling accuracy at >32K tokens is poorly benchmarked for local models. + +3. **ROCm maturity**: AMD's ROCm stack has improved dramatically but is still not at CUDA parity. The optimal backend (llama.cpp Vulkan vs. llama.cpp ROCm vs. vLLM ROCm) varies by model architecture and workload type. + +4. **MoE scheduling on unified memory**: Strix Halo's unified memory architecture allows MoE models to split dense layers (GPU) and sparse experts (CPU RAM) efficiently, but optimal splitting strategies are not well-documented for agentic workloads where expert activation patterns may differ from typical chat use. + +5. **Benchmark saturation**: HumanEval and MBPP are approaching saturation for frontier models. BigCodeBench and SWE-bench provide better discrimination but are significantly harder to run locally. + +6. **Multi-agent evaluation**: Most benchmarks test single-agent performance. Multi-agent workflows (CrewAI, LangGraph multi-agent) lack standardized evaluation frameworks. + +--- + +## 6. Overlap Notes + +- **Throughput benchmarking** overlaps with `docs/benchmarking.md` (which covers llama-bench raw performance). This document focuses on agentic quality metrics rather than raw tok/s. +- **ROCm configuration** overlaps with `docs/optimization.md`. This document assumes the system is already optimized per that guide. +- **External links** should be consolidated into `docs/references.md` when this document is finalized. + +--- + +## Sources + +### Papers +- Patil et al., "The Berkeley Function Calling Leaderboard (BFCL)," ICML 2025 +- Liu et al., "AgentBench: Evaluating LLMs as Agents," ICLR 2024 +- Jimenez et al., "SWE-bench: Can Language Models Resolve Real-world Github Issues?" 
ICLR 2024 +- Mialon et al., "GAIA: A Benchmark for General AI Assistants," ICLR 2024 +- "DeepPlanning: Benchmarking Long-Horizon Agentic Planning with Verifiable Constraints," arXiv:2601.18137, Jan 2026 +- "Qwen3 Technical Report," arXiv:2505.09388, May 2025 +- "Qwen3-Coder-Next Technical Report," arXiv:2603.00729, March 2026 +- "HumanEval Pro and MBPP Pro," ACL 2025 Findings +- "BigCodeBench: Benchmarking Code Generation Towards AGI," ICLR 2025 +- Zhou et al., "Instruction-Following Evaluation for Large Language Models," arXiv:2311.07911, 2023 + +### Repositories & Tools +- [BFCL](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) +- [SWE-bench](https://github.com/SWE-bench/SWE-bench) +- [AgentBench](https://github.com/THUDM/AgentBench) +- [EvalPlus](https://github.com/evalplus/evalplus) +- [BigCodeBench](https://github.com/bigcode-project/bigcodebench) +- [Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai) +- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) +- [smolagents](https://github.com/huggingface/smolagents) +- [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) +- [OpenHands](https://github.com/OpenHands/OpenHands) +- [Qwen3-Coder-30B-A3B GGUF](https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF) +- [Qwen3-Coder-Next GGUF](https://huggingface.co/unsloth/Qwen3-Coder-Next-GGUF) +- [DeepPlanning Dataset](https://huggingface.co/datasets/Qwen/DeepPlanning) + +### Strix Halo Benchmarks +- [Strix Halo GPU LLM Performance Tests (Framework Community)](https://community.frame.work/t/amd-strix-halo-ryzen-ai-max-395-gpu-llm-performance-tests/72521) +- [Strix Halo Benchmark Results (Level1Techs)](https://forum.level1techs.com/t/strix-halo-ryzen-ai-max-395-llm-benchmark-results/233796) +- [Strix Halo Toolboxes Benchmarks](https://kyuz0.github.io/amd-strix-halo-toolboxes/) +- [Qwen3-Coder-30B Strix Halo 
Benchmark](https://github.com/pablo-ross/strix-halo-gmktec-evo-x2/blob/main/QWEN3-CODER-30B_BENCHMARK.md) +- [LLM Tracker - Strix Halo](https://llm-tracker.info/AMD-Strix-Halo-(Ryzen-AI-Max+-395)-GPU-Performance) + +### Guides +- [Qwen3-Coder-Next Local Guide (DEV Community, 2026)](https://dev.to/sienna/qwen3-coder-next-the-complete-2026-guide-to-running-powerful-ai-coding-agents-locally-1k95) +- [Qwen3-Coder Local Setup (Unsloth)](https://unsloth.ai/docs/models/tutorials/qwen3-coder-how-to-run-locally) +- [Qwen llama.cpp Documentation](https://qwen.readthedocs.io/en/latest/run_locally/llama.cpp.html) +- [smolagents + Ollama (Medium)](https://medium.com/@abonia/building-practical-local-ai-agents-with-smolagents-ollama-f92900c51897) +- [Inspect AI Documentation](https://inspect.aisi.org.uk/) diff --git a/docs/model-recommendations.md b/docs/model-recommendations.md new file mode 100644 index 0000000..0faefa6 --- /dev/null +++ b/docs/model-recommendations.md @@ -0,0 +1,488 @@ +# Qwen 3.5 Model Family: Research Summary for Strix Halo (64GB) + +**Date**: 2026-03-26 +**Target Hardware**: AMD Ryzen AI MAX+ 395 / Radeon 8060S (gfx1151), 64 GB unified LPDDR5x, Fedora 43 +**Focus**: GGUF quantized models for llama.cpp inference + +--- + +## Scope + +This report covers the Qwen3.5 model family (released February-March 2026) with emphasis +on GGUF quantization options, file sizes, memory fit analysis for 64GB unified memory, +GGUF quantizer comparison (Unsloth vs bartowski vs others), Unsloth Studio capabilities, +and LM Studio backend support on AMD Strix Halo. Out of scope: cloud API pricing, +full-precision training, non-GGUF formats (AWQ, GPTQ, EXL2). + +--- + +## 1. Qwen3.5 Model Family Overview + +Released mid-February 2026 (medium/large) and March 2, 2026 (small), licensed Apache 2.0. +All models share the Gated DeltaNet hybrid architecture: a 3:1 ratio of linear attention +(Gated DeltaNet) to full softmax attention blocks. 
Native 262K context window, extensible +to 1,010,000 tokens via YaRN scaling. Supports 201 languages. Native multimodal +(vision+language). Thinking/non-thinking hybrid mode. + +| Model | Type | Total Params | Active Params | Architecture | +|-------|------|-------------|---------------|--------------| +| Qwen3.5-397B-A17B | MoE | 397B | 17B | 256 experts, 8 routed + 1 shared | +| Qwen3.5-122B-A10B | MoE | 122B | 10B | 256 experts, 8 routed + 1 shared | +| **Qwen3.5-35B-A3B** | **MoE** | **35B** | **3B** | **256 experts, 8 routed + 1 shared** | +| **Qwen3.5-27B** | **Dense** | **27B** | **27B** | **Full activation** | +| Qwen3.5-9B | Dense | 9B | 9B | Gated DeltaNet hybrid | +| Qwen3.5-4B | Dense | 4B | 4B | Gated DeltaNet hybrid | +| Qwen3.5-2B | Dense | 2B | 2B | Gated DeltaNet hybrid | +| Qwen3.5-0.8B | Dense | 0.8B | 0.8B | Gated DeltaNet hybrid | + +--- + +## 2. Qwen3.5-35B-A3B (MoE) -- Detailed Analysis + +### Architecture Specs + +- Hidden dimension: 2048 +- Token embedding: 248,320 (padded) +- Layers: 40 +- Hidden layout: 10 x (3 x (Gated DeltaNet -> MoE) -> 1 x (Gated Attention -> MoE)) +- MoE: 256 total experts, 8 routed + 1 shared active, expert intermediate dim 512 +- Linear attention heads: 32 (V), 16 (QK), head dim 128 +- Gated attention heads: 16 (Q), 2 (KV), head dim 256 +- BF16 model size: 69.4 GB + +### GGUF Quantizations (Unsloth) + +Source: [unsloth/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF) +Updated March 5, 2026 with improved imatrix data. + +| Quantization | Size (GB) | Fits 64GB? 
| Notes | +|-------------|-----------|------------|-------| +| UD-IQ2_XXS | 10.7 | Yes | Ultra-compressed, quality loss | +| UD-IQ2_M | 11.4 | Yes | | +| UD-Q2_K_XL | 12.2 | Yes | | +| UD-IQ3_XXS | 13.1 | Yes | | +| UD-IQ3_S | 13.6 | Yes | | +| Q3_K_S | 15.3 | Yes | | +| Q3_K_M | 16.4 | Yes | | +| UD-Q3_K_XL | 16.6 | Yes | | +| UD-IQ4_XS | 17.5 | Yes | | +| UD-IQ4_NL | 17.8 | Yes | | +| Q4_K_S | 20.7 | Yes | | +| MXFP4_MOE | 21.6 | Yes | MoE-optimized mixed precision | +| Q4_K_M | 22.0 | **Yes** | **Recommended sweet spot** | +| UD-Q4_K_XL | 22.2 | Yes | Dynamic 2.0, best 4-bit | +| Q5_K_S | 24.8 | Yes | | +| Q5_K_M | 26.2 | Yes | | +| UD-Q5_K_XL | 26.4 | Yes | | +| UD-Q6_K_S | 28.5 | Yes | | +| Q6_K | 28.9 | Yes | | +| UD-Q6_K_XL | 32.1 | Yes | | +| Q8_0 | 36.9 | Yes | High quality, fits with room | +| UD-Q8_K_XL | 48.7 | Yes* | Tight -- ~15GB for KV cache | +| BF16 | 69.4 | **No** | Exceeds 64GB | + +**Key finding**: Every quantization except BF16 fits in 64GB. Even Q8_0 at 36.9 GB +leaves ~27 GB for KV cache and OS overhead, which is excellent. The MoE architecture +(only 3B active params) means token generation is fast relative to total model size. + +### Benchmark Results (Official, from Model Card) + +| Benchmark | Qwen3.5-35B-A3B | GPT-5-mini | Notes | +|-----------|-----------------|-----------|-------| +| MMLU-Pro | 85.3 | 83.7 | Outperforms | +| C-Eval | 90.2 | 82.2 | Outperforms | +| GPQA Diamond | 84.2 | 82.8 | Outperforms | +| SWE-bench Verified | 69.2 | 72.0 | Slightly behind | +| LiveCodeBench v6 | 74.6 | 80.5 | Behind on coding | +| MMMU (vision) | 81.4 | 79.0 | Outperforms | +| MathVision | 83.9 | 71.9 | Strongly outperforms | +| VideoMME (w/ sub.) 
| 86.6 | 83.5 | Outperforms | + +### Strix Halo Performance Estimates + +Based on Qwen3-30B-A3B benchmarks (similar architecture, predecessor): + +| Backend | pp512 (t/s) | tg128 (t/s) | Context | +|---------|-------------|-------------|---------| +| Vulkan RADV | ~755 | ~85 | Short | +| Vulkan AMDVLK | ~742 | ~82 | Short | +| ROCm hipBLASlt | ~652 | ~64 | Short | +| ROCm rocWMMA (tuned) | ~659 | ~68 | Short | +| Vulkan RADV | ~17 | ~13 | 130K | +| ROCm hipBLASlt | ~40 | ~5 | 130K | + +**Key insight**: Vulkan wins on short-context token generation. ROCm wins on +long-context prompt processing. For interactive chat (short-medium context), +Vulkan RADV is the best backend on Strix Halo. + +--- + +## 3. Qwen3.5-27B (Dense) -- Detailed Analysis + +Source: [unsloth/Qwen3.5-27B-GGUF](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) + +The only dense (non-MoE) model in the medium range. All 27B parameters activate on +every forward pass, meaning slower token generation than 35B-A3B despite being +"smaller" in total params. BF16 size: 53.8 GB. + +### GGUF Quantizations (Unsloth) + +| Quantization | Size (GB) | Fits 64GB? | Notes | +|-------------|-----------|------------|-------| +| UD-IQ2_XXS | 8.57 | Yes | | +| UD-IQ2_M | 10.2 | Yes | | +| UD-Q2_K_XL | 11.2 | Yes | | +| UD-IQ3_XXS | 11.5 | Yes | | +| Q3_K_S | 12.3 | Yes | | +| Q3_K_M | 13.5 | Yes | | +| UD-Q3_K_XL | 14.4 | Yes | | +| IQ4_XS | 15.0 | Yes | | +| Q4_0 | 15.7 | Yes | | +| IQ4_NL | 15.7 | Yes | | +| Q4_K_S | 15.8 | Yes | | +| Q4_K_M | 16.7 | **Yes** | **Recommended** | +| UD-Q4_K_XL | 17.6 | Yes | Dynamic 2.0 | +| Q4_1 | 17.2 | Yes | | +| Q5_K_S | 18.9 | Yes | | +| Q5_K_M | 19.6 | Yes | | +| UD-Q5_K_XL | 20.2 | Yes | | +| Q6_K | 22.5 | Yes | | +| UD-Q6_K_XL | 25.7 | Yes | | +| Q8_0 | 28.6 | Yes | Plenty of room | +| UD-Q8_K_XL | 35.5 | Yes | Good quality + headroom | +| BF16 | 53.8 | Yes* | Tight -- only ~10GB for KV cache | + +**Key finding**: All quantizations fit in 64GB, including BF16 (barely). 
However, +because this is a dense model with 27B active params, token generation will be +significantly slower than 35B-A3B (which only activates 3B). For interactive use on +Strix Halo, the 35B-A3B MoE is likely the better choice despite being larger on disk. + +### 35B-A3B vs 27B: Which to Run? + +| Factor | 35B-A3B (MoE) | 27B (Dense) | +|--------|---------------|-------------| +| Active params | 3B | 27B | +| Token gen speed | ~85 t/s (Vulkan) | ~10-15 t/s (estimated) | +| Quality (MMLU-Pro) | 85.3 | Comparable | +| Memory (Q4_K_M) | 22.0 GB | 16.7 GB | +| Memory (Q8_0) | 36.9 GB | 28.6 GB | +| Best for | Interactive chat, speed | Batch processing, quality | + +**Recommendation**: For interactive inference on 64GB Strix Halo, strongly prefer +Qwen3.5-35B-A3B. The MoE architecture is ideal for unified memory systems since +only 3B params are active per token, yielding much faster generation despite the +larger total weight file. + +--- + +## 4. Qwen3.5-122B-A10B (MoE) -- Stretch Goal + +Source: [unsloth/Qwen3.5-122B-A10B-GGUF](https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF) + +BF16 size: 244 GB. This is the next tier up from 35B-A3B. + +### Quantizations That Fit 64GB + +| Quantization | Size (GB) | Fit? | Notes | +|-------------|-----------|------|-------| +| UD-IQ1_M | 34.2 | Yes | 1-bit, quality concerns | +| UD-IQ2_XXS | 36.6 | Yes | Very compressed | +| UD-IQ2_M | 39.1 | Yes | | +| UD-Q2_K_XL | 41.8 | Yes | | +| UD-IQ3_XXS | 44.7 | Yes | | +| UD-IQ3_S | 46.6 | Yes* | Tight with KV cache | +| Q3_K_S | 52.5 | Marginal | Very little KV headroom | +| Q3_K_M | 56.4 | No | Leaves <8GB for everything else | +| Q4_K_M+ | 76.5+ | No | Does not fit | + +**Warning**: Q3-level quantization of 122B has been reported to produce garbled output, +infinite repetition, and failures on tool calls and code generation. The UD-Q2_K_XL +(41.8 GB) is the recommended minimum viable quantization. + +**Verdict**: Possible at 2-bit, but risky. 
Quality at IQ2 level on a 122B MoE model is +largely untested for production use. The 35B-A3B at Q8_0 (36.9 GB) is likely higher +quality than 122B at IQ2 (36.6 GB) and much safer. Not recommended for 64GB systems +unless you specifically need the 10B active parameter count. + +--- + +## 5. Qwen3.5 Small Models (Worth Benchmarking) + +### Qwen3.5-9B + +The standout small model. Outperforms models 3-13x its size: +- GPQA Diamond: 81.7 (vs GPT-OSS-120B: 71.5) +- HMMT Feb 2025: 83.2 +- MMMU-Pro: 70.1 (beats Gemini 2.5 Flash-Lite at 59.7) + +At Q4_K_M, the 9B model needs roughly 6-7 GB. Runs comfortably on any hardware. +Useful as a draft model for speculative decoding with the 35B-A3B. + +### Qwen3.5-4B + +Performance close to the previous Qwen3-80B-A3B (20x larger). Excellent for +on-device/edge tasks. ~3 GB at Q4_K_M. + +--- + +## 6. Best GGUF Quantizers: Unsloth vs bartowski vs Others + +### Providers Compared + +| Provider | Approach | Strengths | +|----------|----------|-----------| +| **Unsloth** | Dynamic 2.0: per-layer adaptive quantization, 1.5M+ token calibration dataset | Best at low bit-rates (Q2, Q3), model-specific tuning, fast updates | +| **bartowski** | Custom imatrix calibration, upstream llama.cpp PR for improved tensor recipes | Lower KLD at Q4_K_M in some tests, stable quality | +| **noctrex** | MXFP4 for MoE experts + Q8/BF16 for rest | Specialized for MoE models | +| **ubergarm** | Standard llama.cpp quantization | Reliable baseline | +| **AesSedai** | imatrix-based | Good coverage, sometimes outperformed by Unsloth Dynamic | +| **mradermacher** | Mass-produced quants across many models | Broad coverage, less specialized | + +### Head-to-Head: Unsloth vs bartowski + +On standard KLD benchmarks (Qwen QwQ-32B comparison): +- bartowski Q4_K_M: 0.0087 KLD +- Unsloth Q4_K_M: 0.0222 KLD +- bartowski IQ4_XS: 0.0127 KLD at 4.93 GiB + +However, on real-world task evaluations (LiveCodeBench v6, MMLU Pro), Unsloth Dynamic +IQ2_XXS outperformed AesSedai 
IQ3_S despite being 11GB smaller -- demonstrating that +KLD/perplexity alone do not predict task performance. + +### Recommendation + +- **Q4 and above**: bartowski and Unsloth are both excellent. bartowski may have slightly + lower KLD at Q4_K_M. Either is a safe choice. +- **Q3 and below**: Unsloth Dynamic 2.0 (UD- prefix) is the clear winner. The per-layer + adaptive approach preserves critical layers at higher precision. +- **MoE-specific**: noctrex MXFP4_MOE is worth testing if you want pure MoE-optimized + quantization. +- **Overall**: For Qwen3.5-35B-A3B, use **Unsloth UD-Q4_K_XL** (22.2 GB) or + **Q8_0** (36.9 GB) for maximum quality. For bartowski, use their Q4_K_M. + +### imatrix Note + +All modern GGUF quantizers now use imatrix (importance matrix) calibration. This adds +5-10% inference overhead but significantly improves quality at low bit-rates. The +calibration dataset matters: Unsloth uses 1.5M+ hand-curated tokens; bartowski uses +different calibration texts optimized for different use cases. + +--- + +## 7. Unsloth Studio + +### What It Is + +Unsloth Studio is an open-source, no-code web UI for training and running LLMs locally. +Released March 17, 2026 (beta). Dual-licensed: Apache 2.0 (core) + AGPL-3.0 (UI). 
+ +### Installation + +```bash +# macOS, Linux, WSL +curl -fsSL https://unsloth.ai/install.sh | sh + +# Launch +unsloth studio -H 0.0.0.0 -p 8888 +``` + +### Capabilities + +| Feature | Details | +|---------|---------| +| **Inference** | Run GGUF and safetensor models with tool-calling, web search, OpenAI-compatible API | +| **Fine-tuning** | SFT, GRPO (RL), 500+ models, 2x faster, 70% less VRAM | +| **Data Recipes** | Auto-create datasets from PDF, CSV, JSON, DOCX, TXT | +| **Model Arena** | Side-by-side comparison of two models | +| **Export** | Save to GGUF or safetensors | +| **Multimodal** | Text, vision, TTS audio, embedding models | + +### Platform Support + +| Platform | Inference | Training | +|----------|-----------|----------| +| Linux (NVIDIA) | Yes | Yes | +| Linux (AMD) | Yes | Coming soon | +| Linux (CPU) | Yes | No | +| macOS | Yes (CPU only) | Coming (MLX) | +| Windows | Yes | Yes | + +### Relevance for Strix Halo + +Unsloth Studio provides inference via llama.cpp backend, so it should work on Strix Halo +for **running** models. Training requires NVIDIA or Intel GPUs currently, so fine-tuning +is not yet supported on AMD. The inference component is essentially a nice web UI wrapper +around llama.cpp, similar to LM Studio but with integrated training capabilities. + +**Verdict**: Useful for inference on Strix Halo. Not yet useful for training on AMD. +If you only need inference, LM Studio or raw llama.cpp may be simpler. If you want +training + inference in one tool (when AMD support arrives), Unsloth Studio is worth +watching. + +--- + +## 8. 
LM Studio on AMD Strix Halo + +### Backend Status + +| Backend | Status | Notes | +|---------|--------|-------| +| **Vulkan** | **Working, recommended** | Best for general inference, no special config needed | +| ROCm | Partially broken | gfx1151 declared supported but data files missing, crashes on inference | +| CPU | Working | Slow fallback | + +### Vulkan Configuration + +LM Studio with Vulkan is the most reliable path on Strix Halo: + +```json +{ + "llm.gpu.backend": "vulkan", + "llm.gpu.device": "auto", + "llm.gpu.layers": -1 +} +``` + +Verify GPU detection: `vulkaninfo | grep "GPU id"` + +An automated installer exists: [smarttechlabs-projects/strix-halo-lmstudio](https://github.com/smarttechlabs-projects/strix-halo-lmstudio) + +### Performance Expectations (LM Studio / Vulkan, 128GB system) + +| Model Size | Quant | Throughput | +|-----------|-------|-----------| +| 7B | Q4 | 30-40 t/s | +| 13B | Q4 | 20-30 t/s | +| 30B MoE | Q4 | ~50+ t/s (MoE advantage) | +| 70B | Q4 | 5-8 t/s | + +For a 64GB system, expect similar per-token speeds but with lower maximum context +lengths before memory pressure kicks in. + +### ROCm Status and Future + +AMD's Ryzen AI Halo Mini PC (Q2 2026) will ship with ROCm 7.2.2 optimization for +LM Studio. As of January 2026, stable ROCm+Linux configurations exist for Strix Halo +(documented at Framework Community). The gfx1151 ROCm issue in LM Studio specifically +is a packaging problem (missing data files), not a fundamental incompatibility. + +For now: use **Vulkan for short-medium context**, or build **llama.cpp from source +with ROCm** for long-context workloads (where Flash Attention matters). + +### LM Studio Unsloth Dynamic 2.0 Note + +There was a reported issue (GitHub #1594) where Unsloth Dynamic 2.0 (UD-) GGUF variants +were not shown in LM Studio's download options. Verify that LM Studio is updated to +the latest version, or download the GGUF files manually from HuggingFace and load +them directly. + +--- + +## 9. 
Recommended Configurations for 64GB Strix Halo + +### Primary: Qwen3.5-35B-A3B (MoE) + +| Use Case | Quantization | Size | KV Budget | Context Est. | +|----------|-------------|------|-----------|-------------| +| Maximum quality | Q8_0 | 36.9 GB | ~25 GB | ~32K-65K | +| Best balance | UD-Q4_K_XL | 22.2 GB | ~40 GB | ~65K-131K | +| Maximum context | UD-IQ3_XXS | 13.1 GB | ~49 GB | ~131K+ | +| Speed test | Q4_K_M | 22.0 GB | ~40 GB | ~65K-131K | + +### Secondary: Qwen3.5-27B (Dense) + +| Use Case | Quantization | Size | KV Budget | Notes | +|----------|-------------|------|-----------|-------| +| Quality comparison | Q8_0 | 28.6 GB | ~33 GB | Slower gen than 35B-A3B | +| Balanced | Q4_K_M | 16.7 GB | ~45 GB | | + +### Quick Reference: Qwen3.5-9B (Small/Draft) + +| Use Case | Quantization | Size | +|----------|-------------|------| +| Speculative decoding draft | Q4_K_M | ~6 GB | +| Standalone small model | Q8_0 | ~10 GB | + +--- + +## 10. Sampling Parameters (Official Recommendations) + +### Thinking Mode (General) +- Temperature: 1.0 +- Top-p: 0.95 +- Top-k: 20 +- Min-p: 0.0 +- Presence penalty: 1.5 +- Max output: 32,768 tokens (general) or 81,920 (math/coding) + +### Thinking Mode (Coding) +- Temperature: 0.6 +- Top-p: 0.95 +- Top-k: 20 +- Presence penalty: 0.0 + +### Non-Thinking / Instruct Mode +- Temperature: 0.7 +- Top-p: 0.8 +- Top-k: 20 +- Presence penalty: 1.5 + +### Best Practices +- Maintain minimum 128K context to preserve thinking capabilities +- Exclude thinking content from multi-turn conversation history +- For math: "Please reason step by step, and put your final answer within \boxed{}." +- For multiple choice: request JSON output like {"answer": "C"} + +--- + +## 11. Open Questions / Limitations + +1. **Qwen3.5 on gfx1151 ROCm**: LM Studio's ROCm backend crashes on Strix Halo due + to missing gfx1151 data files. Building llama.cpp from source with ROCm 7.x works + but requires manual setup. + +2. 
**Vulkan long-context degradation**: Vulkan performance drops significantly beyond + ~4K context on Strix Halo. ROCm with Flash Attention is needed for long-context + workloads, creating a backend choice dilemma. + +3. **Quantizer quality debate**: KLD and perplexity metrics do not always predict + real-world task performance. The "best" quantizer depends on the specific use case. + More task-based evaluation is needed. + +4. **122B-A10B viability at 64GB**: Only fits at 2-bit or aggressive 3-bit. Quality + at these compression levels for a 122B MoE is not well-characterized. + +5. **Unsloth Studio AMD training**: Not yet supported. Timeline unclear ("coming soon"). + +6. **Multi-token Prediction (MTP)**: Qwen3.5 supports MTP for faster generation, but + llama.cpp support status for this feature on the MoE variants needs verification. + +7. **Speculative decoding**: Qwen3.5-9B as a draft model for 35B-A3B has been discussed + but needs benchmarking on Strix Halo specifically. + +--- + +## Sources + +- [Qwen/Qwen3.5-35B-A3B Model Card](https://huggingface.co/Qwen/Qwen3.5-35B-A3B) +- [QwenLM/Qwen3.5 GitHub](https://github.com/QwenLM/Qwen3.5) +- [unsloth/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF) +- [unsloth/Qwen3.5-27B-GGUF](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) +- [unsloth/Qwen3.5-122B-A10B-GGUF](https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF) +- [bartowski/Qwen_Qwen3.5-35B-A3B-GGUF](https://huggingface.co/bartowski/Qwen_Qwen3.5-35B-A3B-GGUF) +- [bartowski/Qwen_Qwen3.5-27B-GGUF](https://huggingface.co/bartowski/Qwen_Qwen3.5-27B-GGUF) +- [noctrex/Qwen3.5-35B-A3B-MXFP4_MOE-GGUF](https://huggingface.co/noctrex/Qwen3.5-35B-A3B-MXFP4_MOE-GGUF) +- [Unsloth Dynamic 2.0 GGUFs Documentation](https://unsloth.ai/docs/basics/unsloth-dynamic-2.0-ggufs) +- [Qwen3.5 GGUF Benchmarks (Unsloth)](https://unsloth.ai/docs/models/qwen3.5/gguf-benchmarks) +- [Unsloth Studio Documentation](https://unsloth.ai/docs/new/studio) +- [Qwen3.5 
Local Running Guide (Unsloth)](https://unsloth.ai/docs/models/qwen3.5) +- [Summary of Qwen3.5 GGUF Evaluations (kaitchup)](https://kaitchup.substack.com/p/summary-of-qwen35-gguf-evaluations) +- [LM Studio Vulkan on Strix Halo (SmartTechLabs)](https://www.smarttechlabs.de/blog/2026-01-14-lmstudio-strix-halo/) +- [LM Studio on Ryzen AI](https://lmstudio.ai/ryzenai) +- [Strix Halo llama.cpp Performance Wiki](https://strixhalo.wiki/AI/llamacpp-performance) +- [AMD Strix Halo Backend Benchmarks](https://kyuz0.github.io/amd-strix-halo-toolboxes/) +- [Strix Halo LLM Optimization (hardware-corner.net)](https://www.hardware-corner.net/strix-halo-llm-optimization/) +- [Qwen3.5 Small Models (Artificial Analysis)](https://artificialanalysis.ai/articles/qwen3-5-small-models) +- [Qwen 3.5 9B Beats 120B Models (VentureBeat)](https://venturebeat.com/technology/alibabas-small-open-source-qwen3-5-9b-beats-openais-gpt-oss-120b-and-can-run) +- [AMD ROCm 7 Strix Halo Performance (Phoronix)](https://www.phoronix.com/review/amd-rocm-7-strix-halo/4) +- [Qwen3.5 Blog (qwen.ai)](https://qwen.ai/blog?id=qwen3.5) diff --git a/docs/references.md b/docs/references.md index 41fd423..e95d50a 100644 --- a/docs/references.md +++ b/docs/references.md @@ -43,6 +43,24 @@ The most comprehensive community resource for Strix Halo LLM optimization. 
- [vLLM](https://github.com/vllm-project/vllm) — High-throughput serving - [llama-benchy](https://github.com/eugr/llama-benchy) — Multi-backend LLM benchmarking +## Qwen3.5 Models (GGUF) + +- [unsloth/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF) — Top pick for 64GB Strix Halo (MoE, 3B active) +- [unsloth/Qwen3.5-27B-GGUF](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) — Dense 27B +- [unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF](https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF) — Best for agentic/coding +- [Qwen3.5 Official](https://github.com/QwenLM/Qwen3.5) — Model family overview +- [Unsloth Dynamic 2.0](https://unsloth.ai/docs/basics/unsloth-dynamic-2.0-ggufs) — Adaptive quantization methodology +- [Unsloth Studio](https://unsloth.ai/docs/new/studio) — Training + inference UI (beta) + +## Agentic Evaluation + +- [Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai) — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA) +- [EvalPlus](https://github.com/evalplus/evalplus) — HumanEval+ / MBPP+ with native ollama support +- [BigCodeBench](https://github.com/bigcode-project/bigcodebench) — 1,140 coding tasks across 139 libraries +- [BFCL](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) — Berkeley Function Calling Leaderboard +- [SWE-bench](https://github.com/princeton-nlp/SWE-bench) — Real GitHub issue resolution +- [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) — Optimized agentic framework for Qwen models + ## AMD GPU Profiling - [Radeon GPU Profiler (RGP)](https://gpuopen.com/rgp/) — Hardware-level Vulkan/HIP profiling diff --git a/scripts/agentic/run-eval.sh b/scripts/agentic/run-eval.sh new file mode 100644 index 0000000..27803c8 --- /dev/null +++ b/scripts/agentic/run-eval.sh @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +# Run agentic evaluations against a local LLM server +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source 
"$SCRIPT_DIR/../../lib/common.sh" + +VENV_DIR="$(data_dir venv)" +EVAL_DIR="$(data_dir evals)" + +# ── Argument parsing ───────────────────────────────────── +SUITE="quick" +MODEL="" +ENDPOINT="http://localhost:11434/v1" # ollama default OpenAI-compat endpoint +PROVIDER="openai" + +while [[ $# -gt 0 ]]; do + case "$1" in + --suite|-s) SUITE="$2"; shift 2 ;; + --model|-m) MODEL="$2"; shift 2 ;; + --endpoint|-e) ENDPOINT="$2"; shift 2 ;; + --help|-h) + echo "Usage: run-eval.sh [--suite quick|full|code|tooluse] [--model NAME] [--endpoint URL]" + echo "" + echo "Suites:" + echo " quick EvalPlus HumanEval+ + IFEval (~1 hour)" + echo " code EvalPlus + BigCodeBench (~2-3 hours)" + echo " tooluse BFCL function calling (~1-2 hours)" + echo " full All of the above (~5-6 hours)" + echo "" + echo "Options:" + echo " --model Model name as known by the server (e.g., qwen3.5:35b-a3b)" + echo " --endpoint OpenAI-compatible endpoint (default: http://localhost:11434/v1)" + exit 0 ;; + *) log_warn "Unknown argument: $1"; shift ;; + esac +done + +# ── Validation ─────────────────────────────────────────── +if [[ -z "$MODEL" ]]; then + log_error "Model name required. Use --model NAME" + log_info "Examples:" + log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)" + log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)" + exit 1 +fi + +if [[ ! -f "$VENV_DIR/bin/activate" ]]; then + log_error "Virtual environment not found. Run: make agentic-setup" + exit 1 +fi +source "$VENV_DIR/bin/activate" + +# Check server is reachable +if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then + # Try ollama native endpoint + if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then + log_info "Ollama detected, using OpenAI-compat endpoint" + else + log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first." 
+ exit 1 + fi +fi + +TS="$(timestamp)" +RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}" +mkdir -p "$RUN_DIR" + +log_header "Agentic Evaluation: $SUITE" +log_info "Model: $MODEL" +log_info "Endpoint: $ENDPOINT" +log_info "Results: $RUN_DIR" + +# Save run metadata +cat > "$RUN_DIR/metadata.json" << ENDJSON +{ + "suite": "$SUITE", + "model": "$MODEL", + "endpoint": "$ENDPOINT", + "timestamp": "$TS", + "hostname": "$(hostname)" +} +ENDJSON + +# ── Start metric logging ──────────────────────────────── +METRICS_FILE="$RUN_DIR/metrics.csv" +bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 & +METRICS_PID=$! +trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT + +# ── Suite execution ────────────────────────────────────── + +run_evalplus() { + local bench="$1" # humaneval or mbpp + log_info "Running EvalPlus $bench..." + local out="$RUN_DIR/evalplus-${bench}.json" + + OPENAI_BASE_URL="$ENDPOINT" evalplus.evaluate \ + --model "$MODEL" \ + --backend openai \ + --dataset "$bench" \ + --greedy \ + 2>&1 | tee "$RUN_DIR/evalplus-${bench}.log" + + # Copy results if generated + local result_dir="$HOME/.evalplus/${MODEL}/${bench}" + if [[ -d "$result_dir" ]]; then + cp -r "$result_dir" "$RUN_DIR/evalplus-${bench}-results/" 2>/dev/null || true + fi + log_success "EvalPlus $bench complete" +} + +run_inspect_eval() { + local eval_name="$1" + local display_name="$2" + log_info "Running Inspect AI: $display_name..." + local out="$RUN_DIR/inspect-${eval_name}.json" + + OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \ + inspect eval "$eval_name" \ + --model "openai/$MODEL" \ + --log-dir "$RUN_DIR/inspect-logs/" \ + 2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log" + + log_success "Inspect $display_name complete" +} + +run_bigcodebench() { + log_info "Running BigCodeBench..." 
+ OPENAI_BASE_URL="$ENDPOINT" bigcodebench.evaluate \ + --model "$MODEL" \ + --backend openai \ + --subset complete \ + 2>&1 | tee "$RUN_DIR/bigcodebench.log" + log_success "BigCodeBench complete" +} + +case "$SUITE" in + quick) + run_evalplus "humaneval" + run_inspect_eval "ifeval" "IFEval (instruction following)" + ;; + code) + run_evalplus "humaneval" + run_evalplus "mbpp" + run_bigcodebench + ;; + tooluse) + run_inspect_eval "bfcl" "BFCL (function calling)" + ;; + full) + run_evalplus "humaneval" + run_evalplus "mbpp" + run_inspect_eval "ifeval" "IFEval (instruction following)" + run_inspect_eval "bfcl" "BFCL (function calling)" + run_bigcodebench + ;; + *) + log_error "Unknown suite: $SUITE. Use: quick, code, tooluse, full" + exit 1 + ;; +esac + +# ── Summary ────────────────────────────────────────────── +log_header "Evaluation Complete" +log_info "Results saved to: $RUN_DIR" +log_info "Contents:" +ls -1 "$RUN_DIR" | sed 's/^/ /' + +# Parse and display results summary +log_header "Results Summary" +for logfile in "$RUN_DIR"/*.log; do + [[ -f "$logfile" ]] || continue + local_name="$(basename "$logfile" .log)" + echo "" + echo " --- $local_name ---" + # Try to extract pass rates from common output formats + grep -iE "(pass@1|accuracy|score|correct|total)" "$logfile" | tail -5 | sed 's/^/ /' || true +done +echo "" diff --git a/scripts/agentic/setup.sh b/scripts/agentic/setup.sh new file mode 100644 index 0000000..8839129 --- /dev/null +++ b/scripts/agentic/setup.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# Setup agentic evaluation tools +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" + +log_header "Agentic Evaluation Setup" + +# ── Python virtual environment ─────────────────────────── +VENV_DIR="$(data_dir venv)" +if [[ ! -f "$VENV_DIR/bin/activate" ]]; then + log_info "Creating Python virtual environment..." 
+ python3 -m venv "$VENV_DIR" + log_success "Virtual environment created at $VENV_DIR" +fi + +source "$VENV_DIR/bin/activate" +log_info "Python: $(python3 --version) from $VENV_DIR" + +# ── Install evaluation frameworks ──────────────────────── + +# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.) +if python3 -c "import inspect_ai" 2>/dev/null; then + log_success "inspect-ai already installed" +else + log_info "Installing inspect-ai (main eval framework)..." + pip install inspect-ai 2>&1 | tail -3 + log_success "inspect-ai installed" +fi + +# EvalPlus — HumanEval+ and MBPP+ with native ollama support +if python3 -c "import evalplus" 2>/dev/null; then + log_success "evalplus already installed" +else + log_info "Installing evalplus (code generation benchmarks)..." + pip install evalplus 2>&1 | tail -3 + log_success "evalplus installed" +fi + +# BigCodeBench +if python3 -c "import bigcodebench" 2>/dev/null; then + log_success "bigcodebench already installed" +else + log_info "Installing bigcodebench..." + pip install bigcodebench 2>&1 | tail -3 + log_success "bigcodebench installed" +fi + +# ── Check for local LLM server ────────────────────────── +log_header "LLM Server Check" + +ollama_ok=false +llamacpp_ok=false + +if is_cmd ollama; then + if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then + log_success "ollama running at localhost:11434" + ollama_ok=true + # List available models + log_info "Available ollama models:" + ollama list 2>/dev/null | head -10 || true + else + log_warn "ollama installed but not running. 
Start with: ollama serve" + fi +else + log_info "ollama not installed — needed for most agentic benchmarks" + log_info "Install: curl -fsSL https://ollama.com/install.sh | sh" +fi + +# Check for llama.cpp server +if curl -s http://localhost:8080/health >/dev/null 2>&1; then + log_success "llama.cpp server running at localhost:8080" + llamacpp_ok=true +else + log_info "No llama.cpp server detected at localhost:8080" + log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap" +fi + +if ! $ollama_ok && ! $llamacpp_ok; then + log_warn "No local LLM server running. Agentic benchmarks need one." +fi + +# ── Summary ────────────────────────────────────────────── +log_header "Setup Complete" +echo "" +echo " Installed tools:" +echo " inspect-ai — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)" +echo " evalplus — HumanEval+ / MBPP+ with native ollama support" +echo " bigcodebench — 1,140 coding tasks across 139 libraries" +echo "" +echo " To activate the virtual environment:" +echo " source data/venv/bin/activate" +echo "" +echo " Run evaluations:" +echo " make agentic-quick # EvalPlus + IFEval (~1 hour)" +echo " make agentic-full # All agentic evaluations (~5-6 hours)" +echo "" diff --git a/scripts/benchmark/setup.sh b/scripts/benchmark/setup.sh index fb56c49..e9ef44d 100644 --- a/scripts/benchmark/setup.sh +++ b/scripts/benchmark/setup.sh @@ -8,13 +8,13 @@ source "$SCRIPT_DIR/../../lib/detect.sh" TOOLBOXES_REPO="/data/workspace/projects/HomeLab/strix-halo-toolboxes/amd-strix-halo-llamacpp-toolboxes" MODEL_DIR="$(data_dir models)" +MODEL_CATALOG="$PROJECT_ROOT/configs/models.conf" log_header "Benchmark Setup" # ── 1. Check toolbox containers ────────────────────────── log_info "Checking toolbox containers..." 
-# Minimum required: vulkan-radv (most stable) REQUIRED_TOOLBOXES=("llama-vulkan-radv") OPTIONAL_TOOLBOXES=("llama-rocm-6.4.4" "llama-rocm-7.2" "llama-vulkan-amdvlk") @@ -22,7 +22,7 @@ existing=$(detect_toolbox_names 2>/dev/null || true) missing=() for tb in "${REQUIRED_TOOLBOXES[@]}"; do - if echo "$existing" | grep -q "^${tb}$"; then + if echo "$existing" | grep -qxF "$tb"; then log_success "Toolbox: $tb" else missing+=("$tb") @@ -31,7 +31,7 @@ for tb in "${OPTIONAL_TOOLBOXES[@]}"; do - if echo "$existing" | grep -q "^${tb}$"; then + if echo "$existing" | grep -qxF "$tb"; then log_success "Toolbox: $tb (optional)" else log_info "Toolbox not present: $tb (optional)" @@ -80,26 +80,54 @@ if (( model_count > 0 )); then done else log_warn "No GGUF models found in $MODEL_DIR" - log_info "Download a test model. Example:" +fi + +# ── 4. Show model catalog ─────────────────────────────── +log_header "Model Catalog" +log_info "Available models (from configs/models.conf):" +echo "" +printf " ${BOLD}%-28s %-10s %-8s %s${RESET}\n" "Name" "Category" "Size" "Description" +echo " $(printf '%.0s─' {1..70})" +while IFS='|' read -r name repo file size_gb category desc; do + [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue + local_file="$MODEL_DIR/$file" + status=" " + if [[ -f "$local_file" ]]; then + status="*" + fi + printf " %s %-27s %-10s %4s GB %s\n" "$status" "$name" "$category" "$size_gb" "$desc" +done < "$MODEL_CATALOG" +echo "" +echo " (* = downloaded)" +echo "" + +# ── 5. 
Offer downloads ────────────────────────────────── +if is_cmd huggingface-cli; then + log_info "Download models with:" echo "" - echo " # Small (4B, ~3 GB):" - echo " huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \\" - echo " --local-dir $MODEL_DIR" + echo " # Recommended starter set:" + echo " # Smoke test (3 GB):" + echo " huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf --local-dir $MODEL_DIR" echo "" - echo " # Medium (14B, ~9 GB):" - echo " huggingface-cli download Qwen/Qwen3-14B-GGUF Qwen3-14B-Q4_K_M.gguf \\" - echo " --local-dir $MODEL_DIR" + echo " # Top pick — Qwen3.5-35B-A3B MoE Q8 (37 GB, ~85 t/s gen):" + echo " huggingface-cli download unsloth/Qwen3.5-35B-A3B-GGUF Qwen3.5-35B-A3B-Q8_0.gguf --local-dir $MODEL_DIR" + echo "" + echo " # Agentic/coding — Qwen3-Coder-30B-A3B (18 GB, best for tool use):" + echo " huggingface-cli download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf --local-dir $MODEL_DIR" + echo "" + echo " # Or download any model from catalog:" + echo " # huggingface-cli download REPO FILE --local-dir $MODEL_DIR" echo "" - if is_cmd huggingface-cli; then - if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as test model?"; then - huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \ + if (( model_count == 0 )); then + if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as smoke test model?"; then + huggingface-cli download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \ --local-dir "$MODEL_DIR" log_success "Model downloaded" fi - else - log_info "Install huggingface-cli: pip install huggingface_hub[cli]" fi +else + log_info "Install huggingface-cli: pip install huggingface_hub[cli]" fi log_header "Setup Complete"