Compare commits
6 Commits
38daf953bf
...
dd403a907c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd403a907c | ||
|
|
ba24091791 | ||
|
|
ea70687cd2 | ||
|
|
1549bc27c0 | ||
|
|
f92b710492 | ||
|
|
7531f6fa74 |
10
Makefile
10
Makefile
@@ -38,6 +38,13 @@ benchmark: ## Run full benchmark suite (supports ARGS="--tag NAME --max-size 20"
|
|||||||
benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare BEFORE=dir AFTER=dir)
|
benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare BEFORE=dir AFTER=dir)
|
||||||
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
||||||
|
|
||||||
|
# --- Serve ---
|
||||||
|
serve: ## Launch llama-server with optimized settings (ARGS="-m MODEL.gguf")
|
||||||
|
@bash bin/serve $(ARGS)
|
||||||
|
|
||||||
|
serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf")
|
||||||
|
@bash bin/serve --ngram $(ARGS)
|
||||||
|
|
||||||
# --- Optimize ---
|
# --- Optimize ---
|
||||||
optimize: ## Interactive optimization walkthrough
|
optimize: ## Interactive optimization walkthrough
|
||||||
@bash bin/optimize --all
|
@bash bin/optimize --all
|
||||||
@@ -48,6 +55,9 @@ optimize-kernel: ## Configure kernel boot parameters
|
|||||||
optimize-tuned: ## Switch to accelerator-performance profile
|
optimize-tuned: ## Switch to accelerator-performance profile
|
||||||
@bash scripts/optimize/tuned-profile.sh
|
@bash scripts/optimize/tuned-profile.sh
|
||||||
|
|
||||||
|
optimize-power: ## Apply Phase 2 tuning (ryzenadj, sysctl, THP, RADV)
|
||||||
|
@bash scripts/optimize/power-profile.sh
|
||||||
|
|
||||||
optimize-vram: ## BIOS VRAM guidance + GTT verification
|
optimize-vram: ## BIOS VRAM guidance + GTT verification
|
||||||
@bash scripts/optimize/vram-gtt.sh
|
@bash scripts/optimize/vram-gtt.sh
|
||||||
|
|
||||||
|
|||||||
@@ -23,10 +23,12 @@ case "${1:-help}" in
|
|||||||
echo " --category LIST Comma-separated: smoke,dense,moe"
|
echo " --category LIST Comma-separated: smoke,dense,moe"
|
||||||
echo " --skip-longctx Skip long-context (32K) tests"
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
echo " --reps N Standard test repetitions (default: 5)"
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " benchmark baseline --max-size 20 --skip-longctx"
|
echo " benchmark baseline --max-size 20 --skip-longctx"
|
||||||
echo " benchmark run --tag post-opt --category moe"
|
echo " benchmark run --tag post-opt --category moe"
|
||||||
|
echo " benchmark run --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|||||||
13
bin/optimize
13
bin/optimize
@@ -6,24 +6,31 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
|||||||
|
|
||||||
case "${1:---all}" in
|
case "${1:---all}" in
|
||||||
--all|-a)
|
--all|-a)
|
||||||
echo "Running optimization walkthrough..."
|
echo "Running full optimization walkthrough..."
|
||||||
|
echo ""
|
||||||
|
echo "=== Phase 1: Core System ==="
|
||||||
bash "$SCRIPT_DIR/scripts/optimize/tuned-profile.sh"
|
bash "$SCRIPT_DIR/scripts/optimize/tuned-profile.sh"
|
||||||
bash "$SCRIPT_DIR/scripts/optimize/kernel-params.sh"
|
bash "$SCRIPT_DIR/scripts/optimize/kernel-params.sh"
|
||||||
bash "$SCRIPT_DIR/scripts/optimize/vram-gtt.sh"
|
bash "$SCRIPT_DIR/scripts/optimize/vram-gtt.sh"
|
||||||
echo ""
|
echo ""
|
||||||
|
echo "=== Phase 2: System Tuning ==="
|
||||||
|
bash "$SCRIPT_DIR/scripts/optimize/power-profile.sh"
|
||||||
|
echo ""
|
||||||
bash "$SCRIPT_DIR/scripts/optimize/verify.sh"
|
bash "$SCRIPT_DIR/scripts/optimize/verify.sh"
|
||||||
;;
|
;;
|
||||||
--kernel|-k) exec bash "$SCRIPT_DIR/scripts/optimize/kernel-params.sh" ;;
|
--kernel|-k) exec bash "$SCRIPT_DIR/scripts/optimize/kernel-params.sh" ;;
|
||||||
--tuned|-t) exec bash "$SCRIPT_DIR/scripts/optimize/tuned-profile.sh" ;;
|
--tuned|-t) exec bash "$SCRIPT_DIR/scripts/optimize/tuned-profile.sh" ;;
|
||||||
--vram|-v) exec bash "$SCRIPT_DIR/scripts/optimize/vram-gtt.sh" ;;
|
--vram|-v) exec bash "$SCRIPT_DIR/scripts/optimize/vram-gtt.sh" ;;
|
||||||
|
--power|-p) exec bash "$SCRIPT_DIR/scripts/optimize/power-profile.sh" ;;
|
||||||
--verify) exec bash "$SCRIPT_DIR/scripts/optimize/verify.sh" ;;
|
--verify) exec bash "$SCRIPT_DIR/scripts/optimize/verify.sh" ;;
|
||||||
--rollback) exec bash "$SCRIPT_DIR/scripts/optimize/rollback.sh" ;;
|
--rollback) exec bash "$SCRIPT_DIR/scripts/optimize/rollback.sh" ;;
|
||||||
*)
|
*)
|
||||||
echo "Usage: optimize [--all|--kernel|--tuned|--vram|--verify|--rollback]"
|
echo "Usage: optimize [--all|--kernel|--tuned|--vram|--power|--verify|--rollback]"
|
||||||
echo " --all Full optimization walkthrough (default)"
|
echo " --all Full optimization walkthrough (Phase 1 + 2)"
|
||||||
echo " --kernel Configure kernel boot parameters"
|
echo " --kernel Configure kernel boot parameters"
|
||||||
echo " --tuned Switch tuned profile"
|
echo " --tuned Switch tuned profile"
|
||||||
echo " --vram BIOS VRAM + GTT guidance"
|
echo " --vram BIOS VRAM + GTT guidance"
|
||||||
|
echo " --power Phase 2: ryzenadj, sysctl, THP, RADV"
|
||||||
echo " --verify Post-optimization checklist"
|
echo " --verify Post-optimization checklist"
|
||||||
echo " --rollback Revert changes"
|
echo " --rollback Revert changes"
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
5
bin/serve
Executable file
5
bin/serve
Executable file
@@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Server dispatcher
|
||||||
|
set -euo pipefail
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||||
|
exec bash "$SCRIPT_DIR/scripts/serve/launch.sh" "$@"
|
||||||
@@ -18,6 +18,12 @@ glm-4.7-flash-q6|lmstudio-community/GLM-4.7-Flash-GGUF|GLM-4.7-Flash-Q6_K.gguf|2
|
|||||||
qwen3.5-27b-q4|unsloth/Qwen3.5-27B-GGUF|Qwen3.5-27B-Q4_K_M.gguf|17|dense|Dense 27B, quality-first
|
qwen3.5-27b-q4|unsloth/Qwen3.5-27B-GGUF|Qwen3.5-27B-Q4_K_M.gguf|17|dense|Dense 27B, quality-first
|
||||||
|
|
||||||
# ── MoE models (fast generation, best for 64GB) ─────────
|
# ── MoE models (fast generation, best for 64GB) ─────────
|
||||||
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_L.gguf|19|moe|MoE 35B, 3B active, Unsloth dynamic
|
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
||||||
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
||||||
nemotron-30b-a3b-q4|lmstudio-community/NVIDIA-Nemotron-3-Nano-30B-A3B-GGUF|NVIDIA-Nemotron-3-Nano-30B-A3B-Q4_K_M.gguf|23|moe|Nemotron MoE 30B, 3B active
|
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
||||||
|
|
||||||
|
# ── Coding models ─────────────────────────────────────────
|
||||||
|
qwen3-coder-30b-a3b-q6|unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF|Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf|26|moe|Agentic coding MoE, pure Transformer
|
||||||
|
|
||||||
|
# ── Draft models (speculative decoding) ───────────────────
|
||||||
|
qwen3.5-0.8b-q8-draft|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|draft|Draft for Qwen3.5 speculative decoding
|
||||||
|
|||||||
14
configs/ryzenadj-llm.service
Normal file
14
configs/ryzenadj-llm.service
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Apply RyzenAdj power limits for LLM inference
|
||||||
|
After=multi-user.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
ExecStart=/usr/local/bin/ryzenadj --stapm-limit=85000 --fast-limit=85000 --slow-limit=85000 --apu-slow-limit=85000
|
||||||
|
RemainAfterExit=yes
|
||||||
|
|
||||||
|
# Re-apply after resume from sleep/hibernate (HP firmware resets limits)
|
||||||
|
ExecStartPost=/bin/sleep 2
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
10
configs/ryzenadj-resume.service
Normal file
10
configs/ryzenadj-resume.service
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Re-apply RyzenAdj power limits after resume
|
||||||
|
After=suspend.target hibernate.target hybrid-sleep.target suspend-then-hibernate.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
ExecStart=/usr/local/bin/ryzenadj --stapm-limit=85000 --fast-limit=85000 --slow-limit=85000 --apu-slow-limit=85000
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=suspend.target hibernate.target hybrid-sleep.target suspend-then-hibernate.target
|
||||||
825
docs/inference-optimization-landscape.md
Normal file
825
docs/inference-optimization-landscape.md
Normal file
@@ -0,0 +1,825 @@
|
|||||||
|
# LLM Inference Optimization Landscape (March 2026)
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
Comprehensive survey of cutting-edge LLM inference optimization techniques applicable
|
||||||
|
to a high-end AMD APU workstation: Ryzen AI MAX+ 395 (Strix Halo), Radeon 8060S
|
||||||
|
(gfx1151, RDNA 3.5), 64 GB unified LPDDR5X memory, 256 GB/s bandwidth. Covers
|
||||||
|
inference engines, quantization, attention, MoE optimization, memory bandwidth, OS-level
|
||||||
|
tuning, hardware features, and model-level techniques. Research current as of March 2026.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Inference Engines and Backends](#1-inference-engines-and-backends)
|
||||||
|
2. [Advanced Quantization Techniques](#2-advanced-quantization-techniques)
|
||||||
|
3. [Attention Optimization](#3-attention-optimization)
|
||||||
|
4. [MoE-Specific Optimizations](#4-moe-specific-optimizations)
|
||||||
|
5. [Memory Bandwidth Optimization](#5-memory-bandwidth-optimization)
|
||||||
|
6. [OS and Runtime Techniques](#6-os-and-runtime-techniques)
|
||||||
|
7. [Emerging Hardware Features](#7-emerging-hardware-features)
|
||||||
|
8. [Model-Level Optimizations](#8-model-level-optimizations)
|
||||||
|
9. [Prioritized Recommendations for Strix Halo](#9-prioritized-recommendations-for-strix-halo)
|
||||||
|
10. [Sources](#10-sources)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Inference Engines and Backends
|
||||||
|
|
||||||
|
### 1.1 llama.cpp -- Still the Foundation
|
||||||
|
|
||||||
|
llama.cpp remains the dominant local inference engine. All major interfaces (Ollama,
|
||||||
|
LM Studio, GPT4All, KoboldCpp) use it under the hood. For Strix Halo specifically:
|
||||||
|
|
||||||
|
- **ROCm/HIP backend**: Build with `-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1151
|
||||||
|
-DGGML_HIP_ROCWMMA_FATTN=ON -DGGML_HIP_UMA=ON`. The `ROCBLAS_USE_HIPBLASLT=1`
|
||||||
|
environment variable forces hipBLASLt kernels, which deliver the best throughput on
|
||||||
|
gfx1151.
|
||||||
|
- **Vulkan backend**: The RADV Mesa driver has seen active RDNA 3.5/4 optimization
|
||||||
|
in Mesa 25.x. In some benchmarks Vulkan outperforms ROCm for single-shot inference
|
||||||
|
and shorter contexts. HIP+WMMA+FlashAttention is fastest for long contexts (tg8192+).
|
||||||
|
- **UMA detection bug (issue #18159)**: llama.cpp's UMA detection can incorrectly
|
||||||
|
limit available memory on AMD APUs with large TTM allocations. The `--mmp 0`
|
||||||
|
(disable mmap) flag is critical for ROCm on Strix Halo to avoid catastrophically
|
||||||
|
slow model loading.
|
||||||
|
- **Performance**: Llama-2-7B Q4_0 achieves ~1464 t/s prompt processing (pp512) and
|
||||||
|
~50 t/s token generation (tg128) on Strix Halo with ROCm.
|
||||||
|
- **Known regression**: A commit enabling WMMA-MMQ INT kernels for RDNA 3 introduced
|
||||||
|
significant prompt processing regression on gfx1151 with ROCm 7.x (issue #17917).
|
||||||
|
|
||||||
|
**Status**: Production-ready. Best single-engine choice for Strix Halo.
|
||||||
|
|
||||||
|
### 1.2 KTransformers -- CPU/GPU Hybrid MoE Specialist
|
||||||
|
|
||||||
|
KTransformers (SOSP 2025) is the most significant new engine for hybrid inference.
|
||||||
|
It was purpose-built for running large MoE models (DeepSeek-R1/V3) on systems with
|
||||||
|
limited GPU memory but abundant CPU memory.
|
||||||
|
|
||||||
|
- **AMX-optimized kernels**: Uses Intel AMX instructions for CPU-side expert
|
||||||
|
computation. For AMD Zen 5, it falls back to AVX-512, which is still substantially
|
||||||
|
faster than naive CPU inference.
|
||||||
|
- **Async CPU-GPU scheduling**: Overlaps CPU expert computation with GPU attention
|
||||||
|
computation, hiding CPU latency.
|
||||||
|
- **Performance**: 4.62-19.74x prefill speedup, 1.25-4.09x decode speedup vs
|
||||||
|
existing hybrid systems. SGLang + KTransformers achieves 220+ tok/s total
|
||||||
|
throughput on trillion-parameter MoE models.
|
||||||
|
- **Relevance to Strix Halo**: Moderate. KTransformers shines when GPU VRAM is
|
||||||
|
scarce (24 GB discrete) and CPU RAM is abundant (382 GB). On Strix Halo, all 64 GB
|
||||||
|
is accessible to the GPU, so the CPU offloading advantage is diminished. However,
|
||||||
|
for models exceeding 64 GB, KTransformers-style hybrid inference becomes relevant.
|
||||||
|
|
||||||
|
**Status**: Production. Most useful for models that exceed available VRAM.
|
||||||
|
|
||||||
|
### 1.3 PowerInfer / PowerInfer-2
|
||||||
|
|
||||||
|
PowerInfer-2 targets smartphones, achieving 11.68 t/s on Mixtral 47B (22x faster
|
||||||
|
than alternatives). It exploits MoE sparsity by predicting which experts will
|
||||||
|
activate and only loading those. The core technique -- hot/cold neuron partitioning
|
||||||
|
and GPU-resident hot neurons -- is architecturally interesting but the implementation
|
||||||
|
targets mobile SoCs with discrete memory hierarchies, not unified-memory APUs where
|
||||||
|
all memory is equally accessible to the GPU.
|
||||||
|
|
||||||
|
**Status**: Research. Techniques are partially subsumed by llama.cpp's own MoE
|
||||||
|
offloading improvements.
|
||||||
|
|
||||||
|
### 1.4 MLC-LLM
|
||||||
|
|
||||||
|
MLC-LLM compiles models via TVM to target multiple backends including ROCm, Vulkan,
|
||||||
|
Metal, and OpenCL. It was one of the first engines to make AMD GPUs competitive for
|
||||||
|
LLM inference (2023 blog post). The Vulkan backend provides a universal fallback
|
||||||
|
that works on any GPU.
|
||||||
|
|
||||||
|
**Status**: Active but niche. For Strix Halo, llama.cpp's native ROCm/Vulkan
|
||||||
|
backends are more mature and better optimized.
|
||||||
|
|
||||||
|
### 1.5 mistral.rs / candle / burn
|
||||||
|
|
||||||
|
Rust-based inference engines:
|
||||||
|
|
||||||
|
- **mistral.rs**: Built on Hugging Face's candle library. Supports GGUF, GPTQ,
|
||||||
|
ISQ (in-situ quantization). Has CUDA support but no ROCm backend.
|
||||||
|
- **candle**: Hugging Face's Rust ML framework. GPU support via CUDA; no ROCm.
|
||||||
|
- **burn**: Rust ML framework with multiple backends (WGPU, Vulkan, CUDA). The
|
||||||
|
WGPU/Vulkan path could theoretically work on AMD, but LLM inference support
|
||||||
|
is limited.
|
||||||
|
|
||||||
|
**Status**: Not viable for Strix Halo in 2026. No ROCm support, and the Vulkan
|
||||||
|
paths are less optimized than llama.cpp's.
|
||||||
|
|
||||||
|
### 1.6 BitNet.cpp
|
||||||
|
|
||||||
|
Microsoft's official 1-bit LLM inference framework. Achieves 6x faster inference
|
||||||
|
and 82% lower energy consumption. GPU kernel support was added May 2025 for NVIDIA
|
||||||
|
and Apple Silicon. No AMD GPU kernels yet. CPU-only mode works on any x86 system
|
||||||
|
and could be relevant for future 1-bit models, but the model ecosystem (BitNet b1.58
|
||||||
|
variants) remains small.
|
||||||
|
|
||||||
|
**Status**: Watch. No AMD GPU support. CPU path works but model selection is limited.
|
||||||
|
|
||||||
|
### 1.7 vLLM and SGLang
|
||||||
|
|
||||||
|
Both are production LLM serving frameworks with AMD ROCm support:
|
||||||
|
|
||||||
|
- **vLLM v0.16.0** (Feb 2026): ROCm is now a first-class platform. 93% of AMD
|
||||||
|
test groups passing. Native AITER FP8 kernels, fused LayerNorm/SiLU, optimized
|
||||||
|
Paged Attention. Extended bitsandbytes quantization to warp-size-32 GPUs (RDNA).
|
||||||
|
- **SGLang**: Supports ROCm. KTransformers integration for hybrid MoE inference.
|
||||||
|
|
||||||
|
Both are overkill for single-user local inference but become relevant for serving
|
||||||
|
multiple users or running agentic workloads with concurrent requests.
|
||||||
|
|
||||||
|
**Status**: Production for server workloads. Consider if running multi-user or
|
||||||
|
agentic eval pipelines.
|
||||||
|
|
||||||
|
### 1.8 ExLlamaV3 / EXL3
|
||||||
|
|
||||||
|
ExLlamaV3 introduces the EXL3 format (based on QTIP from Cornell RelaxML), achieving
|
||||||
|
excellent perplexity at extreme compression (Llama 3.3 70B at 1.75 bpw, 19 GB). The
|
||||||
|
Marlin-inspired GEMM kernels are highly optimized for NVIDIA GPUs. AMD ROCm support
|
||||||
|
was absent at launch (early 2025) and current status is uncertain.
|
||||||
|
|
||||||
|
**Status**: Watch. Potentially best-in-class quantization quality, but AMD support
|
||||||
|
is unclear.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Advanced Quantization Techniques
|
||||||
|
|
||||||
|
### 2.1 GGUF Quantization Landscape
|
||||||
|
|
||||||
|
GGUF remains the dominant format for local inference via llama.cpp. The key variants:
|
||||||
|
|
||||||
|
| Format | Bits | Method | Best For |
|
||||||
|
|-----------|------|-----------------|-----------------------------|
|
||||||
|
| Q8_0 | 8 | Round-to-nearest| Maximum quality, 2x compress|
|
||||||
|
| Q6_K | 6.5 | K-quant | High quality, 2.5x compress |
|
||||||
|
| Q5_K_M | 5.5 | K-quant+imatrix | Balanced quality/size |
|
||||||
|
| Q4_K_M | 4.5 | K-quant+imatrix | Default recommendation |
|
||||||
|
| Q3_K_M | 3.9 | K-quant+imatrix | Aggressive, still usable |
|
||||||
|
| IQ3_XXS | 3.06 | I-quant+imatrix | Extreme compression |
|
||||||
|
| IQ2_XXS | 2.06 | I-quant+imatrix | Near-minimum viable |
|
||||||
|
| IQ1_S | 1.56 | I-quant+imatrix | Experimental |
|
||||||
|
|
||||||
|
**imatrix (Importance Matrix)**: The single most impactful quality improvement for
|
||||||
|
sub-4-bit quantization. The importance matrix identifies which weights produce large
|
||||||
|
activations during inference and allocates more precision to them. For aggressive
|
||||||
|
quantization (<4 bits), imatrix is no longer optional -- it is essential.
|
||||||
|
|
||||||
|
**Recommendation**: Q4_K_M + imatrix for most use cases. Q3_K_M + imatrix when
|
||||||
|
fitting a larger model matters more than marginal quality.
|
||||||
|
|
||||||
|
### 2.2 Unsloth Dynamic 2.0
|
||||||
|
|
||||||
|
Unsloth Dynamic 2.0 (Feb 2026) represents the state-of-the-art in intelligent GGUF
|
||||||
|
quantization:
|
||||||
|
|
||||||
|
- **Per-layer adaptive quantization**: Each layer gets a custom quantization type
|
||||||
|
based on sensitivity analysis. The quantization scheme for Gemma 3 differs
|
||||||
|
significantly from Llama 4.
|
||||||
|
- **Universal MoE + dense support**: Dynamic 2.0 works on all architectures
|
||||||
|
(previously MoE-only).
|
||||||
|
- **Calibration dataset**: 1.5M+ token hand-curated dataset for improved
|
||||||
|
conversational quality.
|
||||||
|
- **Quality results**: Dynamic 3-bit DeepSeek V3.1 GGUF scores 75.6% on 5-shot
|
||||||
|
MMLU, surpassing many full-precision models.
|
||||||
|
- **KL Divergence tracking**: Every GGUF is benchmarked against the original model
|
||||||
|
on both perplexity and KL divergence.
|
||||||
|
|
||||||
|
**Relevance**: Directly applicable. Use Unsloth Dynamic 2.0 GGUFs when available
|
||||||
|
for any model. They consistently outperform standard k-quant GGUFs at the same
|
||||||
|
bit-width.
|
||||||
|
|
||||||
|
### 2.3 AQLM and QuIP#
|
||||||
|
|
||||||
|
Both target extreme compression (2-3 bits):
|
||||||
|
|
||||||
|
- **QuIP#** (ICML 2024): Uses randomized Hadamard transforms + E8 lattice codebooks.
|
||||||
|
First PTQ method where 3-bit outperforms theoretical lossless 4-bit. The E8
|
||||||
|
codebook fits in L1 cache, enabling inference speedups over FP16.
|
||||||
|
- **AQLM** v1.1.7 (April 2025): Additive quantization achieving Pareto optimality
|
||||||
|
below 3 bpw. Outperforms QuIP# on MoE models at 2-bit. Added arbitrary
|
||||||
|
8-dimensional codebooks on GPU.
|
||||||
|
|
||||||
|
Both require PyTorch/CUDA for dequantization kernels. Neither has native llama.cpp
|
||||||
|
integration or AMD support. They represent the theoretical frontier of what is
|
||||||
|
achievable at extreme compression but are not practical for Strix Halo today.
|
||||||
|
|
||||||
|
**Status**: Research. Watch for llama.cpp integration of QTIP (via ExLlamaV3/EXL3).
|
||||||
|
|
||||||
|
### 2.4 AWQ vs GPTQ vs GGUF on AMD
|
||||||
|
|
||||||
|
For AMD GPUs in the llama.cpp ecosystem:
|
||||||
|
|
||||||
|
- **GGUF**: The only practical choice. Native llama.cpp support with ROCm/Vulkan
|
||||||
|
acceleration. K-quants and I-quants are well-optimized.
|
||||||
|
- **AWQ/GPTQ**: Require Marlin kernels for competitive speed (741 tok/s with
|
||||||
|
Marlin-AWQ vs 67 tok/s without on NVIDIA). Marlin kernels are CUDA-only. On AMD,
|
||||||
|
these formats are accessible via vLLM or Hugging Face Transformers with ROCm, but
|
||||||
|
not through llama.cpp.
|
||||||
|
- **Performance hierarchy on AMD (via vLLM)**: GPTQ and AWQ with Marlin kernels are
|
||||||
|
fastest on NVIDIA; on AMD ROCm, the performance advantage over GGUF is minimal
|
||||||
|
and setup complexity is higher.
|
||||||
|
|
||||||
|
**Recommendation**: GGUF for llama.cpp on Strix Halo. AWQ/GPTQ only if using vLLM.
|
||||||
|
|
||||||
|
### 2.5 Mixed-Precision and Layer-Wise Quantization
|
||||||
|
|
||||||
|
Active research area with direct practical implications:
|
||||||
|
|
||||||
|
- **Attention vs FFN sensitivity**: Attention layers (QKV projections, output
|
||||||
|
projection) have varying sensitivity. FFN layers are often the largest component
|
||||||
|
and frequent targets for aggressive quantization (INT4).
|
||||||
|
- **Channel-Wise Mixed-Precision (CMPQ)**: Allocates quantization precision per
|
||||||
|
weight channel based on activation distributions. Adapts to any bit-width.
|
||||||
|
- **HOBBIT for MoE**: Maintains FP16 and INT4 versions of experts simultaneously.
|
||||||
|
Hot experts stay at FP16; cold experts use INT4 or even INT2. This concept is
|
||||||
|
partially implemented in Unsloth Dynamic 2.0's per-layer approach.
|
||||||
|
- **Fine-Grained Mixed Precision (FGMP)**: Goes below row-level granularity to
|
||||||
|
handle unstructured sensitivity patterns in both weights and activations.
|
||||||
|
|
||||||
|
**Relevance**: Unsloth Dynamic 2.0 already implements the practical version of
|
||||||
|
layer-wise mixed precision for GGUF. The research frontier is moving toward
|
||||||
|
sub-layer and channel-level mixed precision.
|
||||||
|
|
||||||
|
### 2.6 KV Cache Quantization
|
||||||
|
|
||||||
|
- **TurboQuant** (ICLR 2026): Being integrated into llama.cpp. TQ3 (3-bit) achieves
|
||||||
|
4.9x compression vs FP16 KV cache; TQ4 (4-bit) achieves 3.8x. This directly
|
||||||
|
reduces memory pressure for long-context inference.
|
||||||
|
- **llama.cpp native**: Already supports Q8_0 and Q4_0 KV cache quantization via
|
||||||
|
`--cache-type-k` and `--cache-type-v` flags.
|
||||||
|
|
||||||
|
**Relevance**: High. On a 64 GB system, KV cache can consume significant memory for
|
||||||
|
long contexts. Q4_0 KV cache is recommended; TurboQuant will push this further.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Attention Optimization
|
||||||
|
|
||||||
|
### 3.1 Flash Attention on AMD
|
||||||
|
|
||||||
|
Current status for RDNA 3.5 / gfx1151:
|
||||||
|
|
||||||
|
- **Triton backend**: Supports CDNA and RDNA GPUs with fp16, bf16, fp32. This is
|
||||||
|
the primary Flash Attention path for non-Instinct AMD GPUs.
|
||||||
|
- **PyTorch integration**: Since PyTorch 2.5.0+, `F.scaled_dot_product_attention`
|
||||||
|
automatically uses Flash Attention on RDNA cards via the Triton backend.
|
||||||
|
- **llama.cpp WMMA Flash Attention**: Enabled via `-DGGML_HIP_ROCWMMA_FATTN=ON`.
|
||||||
|
Uses RDNA 3.5's WMMA instructions for matrix multiply within the attention kernel.
|
||||||
|
This is the fastest path for long-context inference on Strix Halo.
|
||||||
|
- **CK (Composable Kernel) backend**: Supports MI200x, MI250x, MI300x, MI355x.
|
||||||
|
Not available for RDNA consumer GPUs.
|
||||||
|
|
||||||
|
**Gap**: Flash Attention 3 (with asynchronous pipelines and FP8 attention) is
|
||||||
|
NVIDIA Hopper-specific. No AMD equivalent exists.
|
||||||
|
|
||||||
|
### 3.2 SageAttention
|
||||||
|
|
||||||
|
SageAttention (ICLR 2025, ICML 2025, NeurIPS 2025 Spotlight) achieves 2-5x speedup
|
||||||
|
over FlashAttention through quantized attention (8-bit Q/K matrices, FP16 values).
|
||||||
|
SageAttention3 further uses FP4 Tensor Cores on Blackwell GPUs.
|
||||||
|
|
||||||
|
**AMD status**: SageAttention's Triton implementation could theoretically work on
|
||||||
|
AMD GPUs, but no AMD-optimized kernels exist. The quantized attention concept is
|
||||||
|
sound and could be adapted.
|
||||||
|
|
||||||
|
**Status**: Watch. Would be high-impact if ported to AMD.
|
||||||
|
|
||||||
|
### 3.3 Paged Attention
|
||||||
|
|
||||||
|
Paged Attention (vLLM) manages KV cache as non-contiguous memory pages, eliminating
|
||||||
|
60-80% of memory waste from fragmentation. llama.cpp's server mode implements a
|
||||||
|
simplified version of this for concurrent request handling, but the full PagedAttention
|
||||||
|
system is more mature in vLLM.
|
||||||
|
|
||||||
|
**Relevance**: Moderate for single-user. High for multi-user serving.
|
||||||
|
|
||||||
|
### 3.4 GQA/MQA Architecture Implications
|
||||||
|
|
||||||
|
Modern models (Llama 2/3, Mistral, Qwen) use Grouped Query Attention:
|
||||||
|
|
||||||
|
- GQA reduces KV cache by up to 90% vs MHA (Multi-Head Attention)
|
||||||
|
- 30-40% faster inference than MHA with near-equivalent accuracy
|
||||||
|
- Enables larger batch sizes due to smaller memory footprint
|
||||||
|
|
||||||
|
**Practical impact**: When choosing models for Strix Halo, prefer GQA models. All
|
||||||
|
modern model families (Llama 3, Qwen 3, Gemma 3, Mistral) use GQA. Avoid older MHA
|
||||||
|
models when alternatives exist.
|
||||||
|
|
||||||
|
### 3.5 Ring Attention and Linear Attention
|
||||||
|
|
||||||
|
- **Ring Attention**: Distributes long sequences across multiple devices. Achieves
|
||||||
|
1M context prefill in 77s with 93% parallelization efficiency. Not applicable to
|
||||||
|
single-device Strix Halo.
|
||||||
|
- **Linear Attention**: Reduces KV cache from O(n) to O(1) and computation from
|
||||||
|
O(n^2) to O(n). The Ring-Linear models (hybrid softmax + linear attention) reduce
|
||||||
|
inference cost to 1/10 of dense models. This is a model architecture choice, not
|
||||||
|
a runtime optimization.
|
||||||
|
|
||||||
|
**Relevance**: Linear attention models would be transformative for long-context on
|
||||||
|
Strix Halo. Watch for Qwen, DeepSeek, or Llama variants with hybrid attention.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. MoE-Specific Optimizations
|
||||||
|
|
||||||
|
### 4.1 Expert Offloading on Unified Memory
|
||||||
|
|
||||||
|
On discrete GPU systems, MoE inference involves expensive PCIe transfers of expert
|
||||||
|
weights between CPU RAM and GPU VRAM. On Strix Halo's unified memory, this bottleneck
|
||||||
|
is fundamentally different:
|
||||||
|
|
||||||
|
- All expert weights reside in the same physical memory accessible to both CPU and
|
||||||
|
GPU. There is no PCIe transfer cost.
|
||||||
|
- The bottleneck shifts to **memory bandwidth**: at 256 GB/s, loading a 2 GB expert
|
||||||
|
takes ~7.8 ms. With GGUF Q4 quantization, experts are 4x smaller, reducing this
|
||||||
|
to ~2 ms.
|
||||||
|
- **Implication**: Unified memory eliminates the offloading problem but does not
|
||||||
|
eliminate the bandwidth problem. The optimization focus should be on reducing the
|
||||||
|
number of expert weights that must be read per token.
|
||||||
|
|
||||||
|
### 4.2 Expert Caching and Prediction
|
||||||
|
|
||||||
|
The research frontier in 2025-2026 focuses on predicting which experts will be needed:
|
||||||
|
|
||||||
|
- **OD-MoE**: 99.94% expert activation prediction accuracy, delivering ~75% of
|
||||||
|
fully GPU-cached speed using 1/3 GPU memory.
|
||||||
|
- **MoE-SpeQ**: Uses a small draft model to predict expert sequences, enabling
|
||||||
|
prefetching. Combines speculative decoding with expert prediction.
|
||||||
|
- **SP-MoE**: First speculative-decoding-aware expert offloading framework. Achieves
|
||||||
|
1.07-3.5x TPOT speedup by exploiting structural correspondence between draft
|
||||||
|
and target models.
|
||||||
|
- **SliceMoE**: Dynamic Bit-Sliced Caching -- caches experts at sub-expert
|
||||||
|
granularity, assigning precision on demand.
|
||||||
|
- **FlashMoE**: ML-based cache replacement for SSD-based expert offloading on edge.
|
||||||
|
|
||||||
|
**Relevance for Strix Halo**: Expert caching is less critical when all experts fit
|
||||||
|
in memory, but expert prediction can still help by enabling **prefetching into L2/
|
||||||
|
Infinity Cache** before the expert is needed, reducing effective memory latency.
|
||||||
|
|
||||||
|
### 4.3 Expert Pruning
|
||||||
|
|
||||||
|
- Static pruning: Remove least-used experts entirely (MC-SMoE, EEP). Can reduce
|
||||||
|
active parameters by up to 96.875% (TSEP). Requires fine-tuning.
|
||||||
|
- Dynamic pruning: Skip experts below an activation threshold at inference time.
|
||||||
|
38.2% FLOPs reduction with 1.32x speedup (Li et al.).
|
||||||
|
- **DynMoE**: 9% FLOPs reduction, 1.37x speedup through dynamic gating.
|
||||||
|
|
||||||
|
**Relevance**: Moderate. Dynamic expert skipping could reduce memory bandwidth
|
||||||
|
requirements on Strix Halo, but requires model-specific configuration.
|
||||||
|
|
||||||
|
### 4.4 MoE Quantization -- Inactive Expert Compression
|
||||||
|
|
||||||
|
HOBBIT maintains multiple precision versions of experts: FP16 hot experts, INT4 cold
|
||||||
|
experts, INT2 for rarely-used experts. On unified memory, a variant of this approach
|
||||||
|
could keep the working set of experts at higher precision while storing rarely-activated
|
||||||
|
experts at aggressive quantization, reducing total memory footprint.
|
||||||
|
|
||||||
|
MoE-CSP achieves 26x speedup through 4-bit/8-bit quantization with custom CUDA
|
||||||
|
kernels. QMoE achieves 20x memory reduction but lacks efficient 1-bit kernel support.
|
||||||
|
|
||||||
|
**Practical approach for Strix Halo**: Use Unsloth Dynamic 2.0 GGUFs, which already
|
||||||
|
implement per-layer (including per-expert) precision allocation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Memory Bandwidth Optimization
|
||||||
|
|
||||||
|
### 5.1 The Fundamental Bottleneck
|
||||||
|
|
||||||
|
LLM inference (especially token generation / decode) is almost always memory-bandwidth
|
||||||
|
bound. On Strix Halo:
|
||||||
|
|
||||||
|
- **Available bandwidth**: 256 GB/s (LPDDR5X-8000, 256-bit bus)
|
||||||
|
- **Theoretical decode throughput** for a 7B Q4_0 model (~3.5 GB):
|
||||||
|
256 GB/s / 3.5 GB = ~73 tok/s (assuming 100% utilization)
|
||||||
|
- **Measured**: ~50 t/s (tg128), implying ~68% bandwidth utilization
|
||||||
|
- **Infinity Cache effect**: The 32 MB Infinity Cache acts as a bandwidth amplifier.
|
||||||
|
When working set fits in cache, effective bandwidth can exceed 256 GB/s. For LLM
|
||||||
|
inference, per-layer weights typically exceed 32 MB, so cache benefit is limited
|
||||||
|
to KV cache and activations.
|
||||||
|
|
||||||
|
### 5.2 Techniques to Reduce Bandwidth Requirements
|
||||||
|
|
||||||
|
| Technique | Bandwidth Reduction | Status on Strix Halo |
|
||||||
|
|----------------------------|--------------------|-----------------------|
|
||||||
|
| Lower quantization (Q4->Q3)| ~25% | Available now |
|
||||||
|
| KV cache quantization (Q4) | ~75% for KV reads | Available now |
|
||||||
|
| Speculative decoding | 2-3x effective | Available now |
|
||||||
|
| Expert prediction/caching | Variable (MoE) | Research |
|
||||||
|
| Weight compression (EXL3) | Up to 8x | No AMD support |
|
||||||
|
| Activation checkpointing | Reduces peak memory | Available |
|
||||||
|
|
||||||
|
### 5.3 Speculative Decoding
|
||||||
|
|
||||||
|
The most impactful bandwidth optimization technique available today:
|
||||||
|
|
||||||
|
- **Principle**: A small, fast "draft" model generates N candidate tokens. The large
|
||||||
|
"target" model verifies all N tokens in a single forward pass (batch). Accepted
|
||||||
|
tokens are "free" -- they required no additional bandwidth from the target model.
|
||||||
|
- **Speedup**: 2-3x without accuracy loss. NVIDIA demonstrates 3.6x on H200.
|
||||||
|
- **EAGLE-3**: Lightweight autoregressive head attached to target model internals.
|
||||||
|
No separate draft model needed.
|
||||||
|
- **TurboSpec**: Closed-loop control system that dynamically adjusts speculative
|
||||||
|
parameters based on online feedback.
|
||||||
|
- **MoE-SpeQ**: Combines speculative decoding with expert prefetching.
|
||||||
|
|
||||||
|
**Relevance**: High. Speculative decoding is the single highest-impact optimization
|
||||||
|
for decode throughput on bandwidth-limited systems like Strix Halo. llama.cpp
|
||||||
|
supports speculative decoding via `--model-draft`.
|
||||||
|
|
||||||
|
### 5.4 Prefetching Strategies
|
||||||
|
|
||||||
|
- **L2 cache prefetching**: Proactively load KV cache and next-layer weights into
|
||||||
|
GPU L2 during computation. Achieves 2.15x attention kernel speedup on NVIDIA H20.
|
||||||
|
- **PRESERVE**: Prefetch model weights from HBM to on-chip cache during communication
|
||||||
|
operations. Up to 1.6x end-to-end speedup.
|
||||||
|
- **Strix Halo consideration**: The 32 MB Infinity Cache + 2 MB L2 provides limited
|
||||||
|
on-chip storage. Prefetching activations and KV cache (which are smaller than
|
||||||
|
weights) into Infinity Cache during weight reads could help.
|
||||||
|
|
||||||
|
### 5.5 Batched Inference
|
||||||
|
|
||||||
|
Batching amortizes weight-read cost across multiple requests:
|
||||||
|
|
||||||
|
- Single request: ~68% bandwidth utilization on Strix Halo
|
||||||
|
- Batch of 4: Approaches compute-bound regime for prefill; still bandwidth-bound
|
||||||
|
for decode on most models
|
||||||
|
- **Continuous batching** (vLLM, llama.cpp server): 10-20x throughput improvement
|
||||||
|
over naive batching
|
||||||
|
|
||||||
|
**Trade-off**: Batching increases throughput but also increases per-request latency
|
||||||
|
and memory consumption (KV cache scales linearly with batch size).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. OS and Runtime Techniques
|
||||||
|
|
||||||
|
### 6.1 Memory Management
|
||||||
|
|
||||||
|
**Huge Pages**: Transparent Huge Pages (THP) reduce TLB misses for large model
|
||||||
|
weights. On Fedora 43, THP is enabled by default. For explicit control:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check current THP setting
|
||||||
|
cat /sys/kernel/mm/transparent_hugepage/enabled
|
||||||
|
|
||||||
|
# For llama.cpp, ensure THP is at least "madvise"
|
||||||
|
echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
|
||||||
|
```
|
||||||
|
|
||||||
|
For models loaded with mmap, THP automatically promotes 4 KB pages to 2 MB pages,
|
||||||
|
reducing page faults during inference.
|
||||||
|
|
||||||
|
**Memory Locking**: `mlock` prevents model weights from being swapped. llama.cpp's
|
||||||
|
`--mlock` flag enables this. Critical for systems running other workloads alongside
|
||||||
|
inference.
|
||||||
|
|
||||||
|
**mmap vs direct load**: On Strix Halo with ROCm, disabling mmap (`--no-mmap`; `-mmp 0` in llama-bench) is
|
||||||
|
recommended. mmap causes catastrophically slow model loading when GPU offloading is
|
||||||
|
active because of the double-copy path through page cache.
|
||||||
|
|
||||||
|
### 6.2 Process Pinning and NUMA
|
||||||
|
|
||||||
|
Strix Halo is a single-die APU, so NUMA topology is simple (typically 1 NUMA node).
|
||||||
|
However, CPU core affinity still matters:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pin inference to specific cores, keeping others free for OS
|
||||||
|
numactl --physcpubind=0-15 llama-server [args]
|
||||||
|
|
||||||
|
# Or via taskset
|
||||||
|
taskset -c 0-15 llama-server [args]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Core isolation**: For minimum-jitter inference:
|
||||||
|
```bash
|
||||||
|
# Add to kernel cmdline
|
||||||
|
isolcpus=0-15 nohz_full=0-15 rcu_nocbs=0-15
|
||||||
|
```
|
||||||
|
This prevents the OS from scheduling unrelated tasks on inference cores.
|
||||||
|
|
||||||
|
### 6.3 CPU Frequency and Power
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set performance governor for consistent throughput
|
||||||
|
sudo cpupower frequency-set -g performance
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
cpupower frequency-info | grep "current CPU frequency"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.4 cgroups v2 for Resource Isolation
|
||||||
|
|
||||||
|
Reserve memory and CPU for inference workloads:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create inference cgroup
|
||||||
|
sudo mkdir /sys/fs/cgroup/inference
|
||||||
|
echo "+memory +cpu +cpuset" | sudo tee /sys/fs/cgroup/cgroup.subtree_control
|
||||||
|
|
||||||
|
# Reserve 56 GB for inference (leave 8 GB for system)
|
||||||
|
echo $((56 * 1024 * 1024 * 1024)) | sudo tee /sys/fs/cgroup/inference/memory.min
|
||||||
|
|
||||||
|
# Pin CPU cores
|
||||||
|
echo "0-15" | sudo tee /sys/fs/cgroup/inference/cpuset.cpus
|
||||||
|
|
||||||
|
# Run inference in cgroup
|
||||||
|
sudo cgexec -g memory,cpu:inference llama-server [args]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.5 io_uring for Model Loading
|
||||||
|
|
||||||
|
io_uring provides zero-copy, kernel-bypassing I/O that can accelerate initial model
|
||||||
|
loading. While llama.cpp does not natively use io_uring, the underlying mmap/read
|
||||||
|
path can benefit from io_uring-based file I/O when loading from NVMe:
|
||||||
|
|
||||||
|
- Eliminates context switch overhead during model load
|
||||||
|
- Enables true async I/O with completion ring buffers
|
||||||
|
- Most benefit when loading very large models (>32 GB) from storage
|
||||||
|
|
||||||
|
**Practical impact**: Minor for Strix Halo since model loading is a one-time cost,
|
||||||
|
and LPDDR5X bandwidth far exceeds NVMe read speeds.
|
||||||
|
|
||||||
|
### 6.6 eBPF-Based Performance Monitoring
|
||||||
|
|
||||||
|
eBPF enables zero-instrumentation monitoring of inference workloads:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Monitor GPU DRM scheduler jobs (works with amdgpu driver)
|
||||||
|
sudo bpftrace -e 'tracepoint:drm:drm_sched_job { printf("GPU job: %s\n", args->name); }'
|
||||||
|
|
||||||
|
# Track page faults during model loading
|
||||||
|
sudo bpftrace -e 'tracepoint:exceptions:page_fault_user { @[comm] = count(); }'
|
||||||
|
|
||||||
|
# Monitor context switches on inference cores
|
||||||
|
sudo bpftrace -e 'tracepoint:sched:sched_switch /cpu == 0/ { @[args->next_comm] = count(); }'
|
||||||
|
```
|
||||||
|
|
||||||
|
The eunomia project provides ready-made eBPF programs for AI workload monitoring.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Emerging Hardware Features
|
||||||
|
|
||||||
|
### 7.1 AMD XDNA NPU
|
||||||
|
|
||||||
|
The Ryzen AI MAX+ 395 includes an XDNA 2 NPU rated at 50 TOPS. Current status for
|
||||||
|
LLM inference:
|
||||||
|
|
||||||
|
- **Software stack**: AMD Ryzen AI Software supports ONNX model execution on the NPU.
|
||||||
|
AMD Quark provides quantization for NPU deployment (SmoothQuant, GPTQ, Quarot).
|
||||||
|
- **LLM capability**: The NPU can accelerate small models and specific operations
|
||||||
|
(attention heads, small expert networks) but cannot run full large LLMs.
|
||||||
|
- **Linux support**: Kernel 7.1 (expected 2026) brings significant XDNA upstreaming.
|
||||||
|
Current Linux support is limited compared to Windows.
|
||||||
|
- **Practical use**: The NPU could potentially handle a speculative decoding draft
|
||||||
|
model while the GPU runs the main model. This is not yet implemented in any
|
||||||
|
inference engine.
|
||||||
|
|
||||||
|
**Status**: Not viable for LLM inference in March 2026. Watch for Linux kernel 7.1
|
||||||
|
and llama.cpp NPU backend development.
|
||||||
|
|
||||||
|
### 7.2 RDNA 3.5 Matrix Cores (WMMA)
|
||||||
|
|
||||||
|
The Radeon 8060S (gfx1151) has the same WMMA instruction set as RDNA 3 (gfx11xx),
|
||||||
|
which is a generation behind RDNA 4 (gfx12xx):
|
||||||
|
|
||||||
|
**RDNA 3 / 3.5 (gfx1151) WMMA capabilities**:
|
||||||
|
- FP16/BF16: 512 FLOPS/clock/CU
|
||||||
|
- INT8: 1024 OPS/clock/CU
|
||||||
|
- 16x16 matrix dimensions
|
||||||
|
- Requires inter-lane data shuffling for chained operations
|
||||||
|
|
||||||
|
**RDNA 4 (gfx12xx) improvements over RDNA 3.5**:
|
||||||
|
- FP16/BF16: 1024 FLOPS/clock/CU (2x)
|
||||||
|
- INT8: 2048 OPS/clock/CU (2x)
|
||||||
|
- New FP8/BF8 formats at 4x the FP16 rate
|
||||||
|
- 4:2 structured sparsity support (effectively 2x more)
|
||||||
|
- No inter-lane shuffling needed for chained WMMA (major efficiency gain)
|
||||||
|
- New efficient matrix load instruction
|
||||||
|
|
||||||
|
**Current usage in llama.cpp**: WMMA is used for Flash Attention
|
||||||
|
(`GGML_HIP_ROCWMMA_FATTN`) and matrix-multiply quantized (`MMQ`) kernels. The
|
||||||
|
ROCm 7.x regression for gfx1151 (issue #17917) specifically affects MMQ kernels.
|
||||||
|
|
||||||
|
### 7.3 Vulkan Cooperative Matrices
|
||||||
|
|
||||||
|
The `VK_KHR_cooperative_matrix` Vulkan extension was merged into the RADV driver
|
||||||
|
for RDNA 3+ hardware. This provides a portable API for matrix operations that maps
|
||||||
|
to WMMA hardware:
|
||||||
|
|
||||||
|
- Enables inference engines to use matrix cores through Vulkan instead of
|
||||||
|
vendor-specific ROCm/HIP APIs
|
||||||
|
- llama.cpp's Vulkan backend could leverage this for WMMA-accelerated matrix
|
||||||
|
operations
|
||||||
|
- Currently less optimized than native HIP/ROCm paths
|
||||||
|
|
||||||
|
**Status**: Available in Mesa 25.x. Watch for llama.cpp Vulkan backend improvements
|
||||||
|
using cooperative matrices.
|
||||||
|
|
||||||
|
### 7.4 Infinity Cache for Inference
|
||||||
|
|
||||||
|
Strix Halo has a 32 MB Infinity Cache (MALL -- Memory Attached Last Level):
|
||||||
|
|
||||||
|
- **Architecture**: L1 (256 KB/shader array) -> L2 (2 MB) -> Infinity Cache (32 MB)
|
||||||
|
-> LPDDR5X
|
||||||
|
- **Latency**: Slightly higher than discrete GPU Infinity Cache implementations
|
||||||
|
- **Hit rate**: Varies by workload. Graphics benchmarks show ~73% hit rate at peak.
|
||||||
|
- **LLM inference implications**: For a 7B Q4 model (~3.5 GB), per-layer weights
|
||||||
|
are ~70-140 MB, far exceeding the 32 MB cache. Benefit is limited to:
|
||||||
|
- KV cache for current context (fits well for shorter contexts)
|
||||||
|
- Activations and intermediate results
|
||||||
|
- Embedding layer (often accessed repeatedly)
|
||||||
|
- Small models/layers that fit entirely in cache
|
||||||
|
|
||||||
|
The Infinity Cache is most impactful as a bandwidth amplifier -- when inference
|
||||||
|
accesses exhibit temporal locality (same data accessed multiple times within a
|
||||||
|
short window), effective bandwidth exceeds the 256 GB/s DRAM limit.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Model-Level Optimizations
|
||||||
|
|
||||||
|
### 8.1 Prompt Compression
|
||||||
|
|
||||||
|
- **LLMLingua / LLMLingua-2** (Microsoft): Compresses input prompts by removing
|
||||||
|
low-information tokens. 20x compression with 1.5 point performance drop.
|
||||||
|
1.7-5.7x end-to-end inference speedup. LLMLingua-2 is 3-6x faster than v1.
|
||||||
|
Integrated into LangChain and LlamaIndex.
|
||||||
|
- **500xCompressor**: Compresses contexts into a single special token. 6x-480x
|
||||||
|
compression. Adds only 0.25% parameters. More aggressive but less mature.
|
||||||
|
|
||||||
|
**Relevance**: High for RAG and agentic workloads where prompts are long. Reduces
|
||||||
|
both prefill time and KV cache memory.
|
||||||
|
|
||||||
|
### 8.2 Speculative Decoding (Model-Level)
|
||||||
|
|
||||||
|
Beyond the engine-level implementation described in Section 5.3:
|
||||||
|
|
||||||
|
- **Self-speculative decoding**: Model drafts its own tokens using early exit from
|
||||||
|
lower layers. No separate draft model needed.
|
||||||
|
- **EAGLE-3**: Autoregressive head on target model internals. Higher acceptance
|
||||||
|
rates than separate draft models.
|
||||||
|
- **Draft model latency > accuracy**: Research shows that draft model speed matters
|
||||||
|
more than its language modeling accuracy for overall throughput.
|
||||||
|
|
||||||
|
### 8.3 Mixture of Depths / Mixture of Recursions
|
||||||
|
|
||||||
|
- **Mixture of Depths (MoD)**: Dynamically allocates compute to tokens that need it.
|
||||||
|
2-3x inference speedup with minimal quality degradation. Implemented at training
|
||||||
|
time -- requires model architecture support.
|
||||||
|
- **Mixture of Recursions (MoR)** (NeurIPS 2025): Combines parameter sharing with
|
||||||
|
adaptive token-level compute. Lightweight routers assign different recursion depths
|
||||||
|
to individual tokens. 2x inference throughput with reduced KV cache sizes.
|
||||||
|
|
||||||
|
**Relevance**: These are model architecture choices, not runtime optimizations.
|
||||||
|
Watch for models trained with MoD/MoR architectures.
|
||||||
|
|
||||||
|
### 8.4 Structured Pruning
|
||||||
|
|
||||||
|
Post-training methods to permanently remove model components:
|
||||||
|
|
||||||
|
- **Width pruning**: Remove neurons, attention heads, or embedding channels. Better
|
||||||
|
accuracy retention than depth pruning.
|
||||||
|
- **Depth pruning**: Remove entire layers. More latency reduction per parameter
|
||||||
|
removed.
|
||||||
|
- **LLM-Pruner, SliceGPT, FLAP**: State-of-the-art structured pruning methods.
|
||||||
|
- **AMP**: Jointly prunes attention heads and MLP neurons.
|
||||||
|
- **NIRVANA** (2025): Structured pruning reimagined for LLM compression.
|
||||||
|
|
||||||
|
**Practical approach**: Structured pruning requires per-model effort and is generally
|
||||||
|
less practical than quantization for local inference. Exception: if a specific model
|
||||||
|
is too slow at a given quantization level, pruning the model first and then
|
||||||
|
quantizing can yield a better speed/quality trade-off.
|
||||||
|
|
||||||
|
### 8.5 Token Merging and Pruning
|
||||||
|
|
||||||
|
- **TokenSelect** (EMNLP 2025): Dynamic token-level KV cache selection for
|
||||||
|
efficient long-context inference and length extrapolation.
|
||||||
|
- **LightThinker**: Step-by-step compression of chain-of-thought reasoning.
|
||||||
|
- **Attention sparsity**: Twilight (NeurIPS 2025) uses hierarchical top-p pruning
|
||||||
|
for adaptive attention sparsity.
|
||||||
|
|
||||||
|
These techniques reduce the effective sequence length during inference, directly
|
||||||
|
reducing both compute and memory bandwidth requirements.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Prioritized Recommendations for Strix Halo
|
||||||
|
|
||||||
|
### Tier 1: Implement Now (High Impact, Available Today)
|
||||||
|
|
||||||
|
1. **Use Unsloth Dynamic 2.0 GGUFs** for all models. They provide the best
|
||||||
|
quality-per-bit through intelligent layer-wise quantization.
|
||||||
|
|
||||||
|
2. **Build llama.cpp with WMMA Flash Attention**: `-DGGML_HIP_ROCWMMA_FATTN=ON
|
||||||
|
-DGGML_HIP_UMA=ON`. Monitor issue #17917 for MMQ regression fix.
|
||||||
|
|
||||||
|
3. **Disable mmap for ROCm**: Always use `--no-mmap` (or `-mmp 0` with llama-bench) to avoid the
|
||||||
|
double-copy performance penalty.
|
||||||
|
|
||||||
|
4. **Enable KV cache quantization**: Use `--cache-type-k q4_0 --cache-type-v q4_0`
|
||||||
|
for long-context workloads. Watch for TurboQuant integration.
|
||||||
|
|
||||||
|
5. **Set ROCBLAS_USE_HIPBLASLT=1**: Forces the optimized hipBLASLt kernels.
|
||||||
|
|
||||||
|
6. **Speculative decoding for decode-heavy workloads**: Use `--model-draft` with a
|
||||||
|
small model from the same family.
|
||||||
|
|
||||||
|
7. **GPU performance governor and frequency pinning**: Ensures consistent throughput.
|
||||||
|
|
||||||
|
### Tier 2: Evaluate (Moderate Impact, Some Setup Required)
|
||||||
|
|
||||||
|
8. **LLMLingua-2 for agentic/RAG workloads**: Compress long prompts before inference.
|
||||||
|
3-6x prompt processing speedup.
|
||||||
|
|
||||||
|
9. **vLLM for multi-user serving**: If running concurrent inference requests
|
||||||
|
(e.g., agentic eval pipelines), vLLM's continuous batching and PagedAttention
|
||||||
|
provide 10-20x throughput improvement.
|
||||||
|
|
||||||
|
10. **cgroups v2 memory reservation**: Prevent the OS from reclaiming GPU-mapped
|
||||||
|
memory under memory pressure.
|
||||||
|
|
||||||
|
11. **Vulkan backend for short-context workloads**: Test whether the Vulkan/RADV
|
||||||
|
path is faster than ROCm for your specific model and context length.
|
||||||
|
|
||||||
|
12. **Process pinning** with `numactl` or `taskset` for reduced scheduling jitter.
|
||||||
|
|
||||||
|
### Tier 3: Watch and Prepare (High Potential, Not Ready)
|
||||||
|
|
||||||
|
13. **KTransformers for >64 GB models**: When running DeepSeek V3 or similar models
|
||||||
|
that exceed available memory.
|
||||||
|
|
||||||
|
14. **ExLlamaV3/EXL3 AMD support**: If AMD kernels arrive, EXL3's QTIP-based
|
||||||
|
quantization could significantly improve quality at extreme compression.
|
||||||
|
|
||||||
|
15. **XDNA NPU for draft model acceleration**: If/when llama.cpp adds NPU backend
|
||||||
|
support, the NPU could run the draft model for speculative decoding.
|
||||||
|
|
||||||
|
16. **SageAttention AMD port**: 2-5x attention speedup through quantized attention.
|
||||||
|
|
||||||
|
17. **Linear attention models**: Watch for hybrid softmax/linear attention models
|
||||||
|
from major labs that would dramatically improve long-context inference.
|
||||||
|
|
||||||
|
18. **Cooperative matrices in Vulkan**: As llama.cpp's Vulkan backend matures, this
|
||||||
|
provides a portable path to WMMA acceleration without ROCm dependency.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Sources
|
||||||
|
|
||||||
|
### Papers and Conference Proceedings
|
||||||
|
|
||||||
|
- Raposo et al., "Mixture-of-Depths: Dynamically allocating compute in transformer-based language models," 2024. https://arxiv.org/abs/2404.02258
|
||||||
|
- Ainslie et al., "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints," ICML 2023. https://arxiv.org/abs/2305.13245
|
||||||
|
- Tseng et al., "QuIP#: Even Better LLM Quantization with Hadamard Incoherence and Lattice Codebooks," ICML 2024. https://arxiv.org/abs/2402.04396
|
||||||
|
- Egiazarian et al., "AQLM: Extreme Compression of Large Language Models via Additive Quantization," ICLR 2025. https://arxiv.org/abs/2401.06118
|
||||||
|
- Chen et al., "KTransformers: Unleashing the Full Potential of CPU/GPU Hybrid Inference for MoE Models," SOSP 2025. https://dl.acm.org/doi/10.1145/3731569.3764843
|
||||||
|
- Min et al., "Mixture-of-Recursions: Learning Dynamic Recursive Depths for Adaptive Token-Level Computation," NeurIPS 2025. https://arxiv.org/abs/2507.10524
|
||||||
|
- Varadarajan et al., "Characterizing and Optimizing LLM Inference Workloads on CPU-GPU Coupled Architectures," 2025. https://arxiv.org/abs/2504.11750
|
||||||
|
- Zandieh et al., "TurboQuant: Extreme KV Cache Quantization," ICLR 2026. https://arxiv.org/abs/2504.19874
|
||||||
|
- Agrawal et al., "SageAttention: Accurate 8-Bit Attention for Plug-and-play Inference Acceleration," ICLR 2025. https://arxiv.org/abs/2410.02367
|
||||||
|
- Ye et al., "FlashInfer: Efficient and Customizable Attention Engine for LLM Serving," 2025. https://arxiv.org/abs/2501.01005
|
||||||
|
- Jiang et al., "LLMLingua: Compressing Prompts for Accelerated Inference," EMNLP 2023. https://arxiv.org/abs/2310.05736
|
||||||
|
- Li et al., "A Survey on Inference Optimization Techniques for Mixture of Experts Models," 2024. https://arxiv.org/abs/2412.14219
|
||||||
|
- Liu et al., "MoE-SpeQ: Speculative Quantized Decoding with Proactive Expert Prefetching," 2025. https://arxiv.org/abs/2511.14102
|
||||||
|
- Zhou et al., "SP-MoE: Speculative Decoding and Prefetching for Accelerating MoE Inference," 2025. https://arxiv.org/abs/2510.10302
|
||||||
|
- He et al., "SliceMoE: Bit-Sliced Expert Caching under Miss-Rate Constraints," 2025. https://arxiv.org/abs/2512.12990
|
||||||
|
- Jin et al., "OD-MoE: On-Demand Expert Loading for Cacheless Edge-Distributed MoE Inference," 2025. https://arxiv.org/abs/2512.03927
|
||||||
|
|
||||||
|
### Documentation and Technical References
|
||||||
|
|
||||||
|
- AMD ROCm Strix Halo System Optimization: https://rocm.docs.amd.com/en/latest/how-to/system-optimization/strixhalo.html
|
||||||
|
- AMD GPUOpen -- Using Matrix Cores of RDNA 4: https://gpuopen.com/learn/using_matrix_core_amd_rdna4/
|
||||||
|
- AMD GPUOpen -- Accelerating Generative AI on Radeon GPUs: https://gpuopen.com/learn/accelerating_generative_ai_on_amd_radeon_gpus/
|
||||||
|
- vLLM ROCm Blog: https://blog.vllm.ai/2026/02/27/rocm-attention-backend.html
|
||||||
|
- AMD ROCm vLLM Blog: https://rocm.blogs.amd.com/software-tools-optimization/vllm-omni/README.html
|
||||||
|
- AMD AI Inference on Ryzen AI NPU with Quark: https://www.amd.com/en/developer/resources/technical-articles/2025/ai-inference-acceleration-on-ryzen-ai-with-quark.html
|
||||||
|
- Chips and Cheese -- Evaluating Infinity Cache in Strix Halo: https://chipsandcheese.com/p/evaluating-the-infinity-cache-in
|
||||||
|
- Chips and Cheese -- RDNA 4 Architecture at Hot Chips 2025: https://chipsandcheese.com/p/amds-rdna4-gpu-architecture-at-hot
|
||||||
|
- Linux Kernel XDNA NPU Documentation: https://docs.kernel.org/accel/amdxdna/amdnpu.html
|
||||||
|
|
||||||
|
### Community Resources and Guides
|
||||||
|
|
||||||
|
- llama.cpp ROCm Performance Discussion: https://github.com/ggml-org/llama.cpp/discussions/15021
|
||||||
|
- llama.cpp Strix Halo UMA Detection Bug: https://github.com/ggml-org/llama.cpp/issues/18159
|
||||||
|
- llama.cpp Strix Halo Performance Regression: https://github.com/ggml-org/llama.cpp/issues/17917
|
||||||
|
- Strix Halo Wiki -- llama.cpp with ROCm: https://strixhalo.wiki/AI/llamacpp-with-ROCm
|
||||||
|
- Strix Halo Wiki -- Performance: https://strixhalo.wiki/AI/llamacpp-performance
|
||||||
|
- AMD Strix Halo Toolboxes: https://github.com/kyuz0/amd-strix-halo-toolboxes
|
||||||
|
- LLM Tracker -- AMD GPUs: https://llm-tracker.info/howto/AMD-GPUs
|
||||||
|
- LLM Tracker -- Strix Halo: https://llm-tracker.info/_TOORG/Strix-Halo
|
||||||
|
- Unsloth Dynamic 2.0 Documentation: https://unsloth.ai/docs/basics/unsloth-dynamic-2.0-ggufs
|
||||||
|
- Unsloth Dynamic v2.0 Blog: https://unsloth.ai/blog/dynamic-v2
|
||||||
|
- KTransformers GitHub: https://github.com/kvcache-ai/ktransformers
|
||||||
|
- ExLlamaV3 GitHub: https://github.com/turboderp-org/exllamav3
|
||||||
|
- BitNet GitHub: https://github.com/microsoft/BitNet
|
||||||
|
- LLMLingua GitHub: https://github.com/microsoft/LLMLingua
|
||||||
|
- MoE Inference Awesome List: https://github.com/MoE-Inf/awesome-moe-inference
|
||||||
|
- Awesome LLM Inference: https://github.com/xlite-dev/Awesome-LLM-Inference
|
||||||
|
- Phoronix -- ROCm 7.1 vs Vulkan on AI PRO R9700: https://www.phoronix.com/review/rocm-71-llama-cpp-vulkan
|
||||||
|
- eunomia -- OS-Level LLM Inference Optimizations: https://eunomia.dev/blog/2025/02/18/os-level-challenges-in-llm-inference-and-optimizations/
|
||||||
|
- RADV Cooperative Matrix for RDNA4: https://www.phoronix.com/forums/forum/phoronix/latest-phoronix-articles/1524861-vulkan-cooperative-matrix-merged-for-rdna4-gpus-with-radv-dcc-support-inches-closer
|
||||||
|
- Kaitchup -- GGUF Quant Selection: https://kaitchup.substack.com/p/choosing-a-gguf-model-k-quants-i
|
||||||
---

<!-- New file: docs/llama-cpp-optimization-research.md (+806 lines) -->
|
|||||||
|
# llama.cpp Runtime and Compilation Optimization for AMD RDNA 3.5 (gfx1151)
|
||||||
|
|
||||||
|
Comprehensive research into maximizing inference performance on AMD Strix Halo
|
||||||
|
(Ryzen AI MAX+ 395, Radeon 8060S gfx1151, 64 GB unified LPDDR5x-8000).
|
||||||
|
Researched March 2026.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This document covers every known compilation flag, runtime parameter, environment
|
||||||
|
variable, and architectural optimization for llama.cpp targeting gfx1151 (RDNA 3.5)
|
||||||
|
with both ROCm/HIP and Vulkan backends on Fedora. It does not cover vLLM, ollama
|
||||||
|
internals, or non-llama.cpp inference engines except where their findings inform
|
||||||
|
llama.cpp optimization.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Compilation Flags and Build Optimizations](#1-compilation-flags-and-build-optimizations)
|
||||||
|
2. [Runtime Flags and Environment Variables](#2-runtime-flags-and-environment-variables)
|
||||||
|
3. [Flash Attention and Attention Backends](#3-flash-attention-and-attention-backends)
|
||||||
|
4. [Quantization Strategies for Speed](#4-quantization-strategies-for-speed)
|
||||||
|
5. [Memory Layout and Caching](#5-memory-layout-and-caching)
|
||||||
|
6. [llama-server Specific Optimizations](#6-llama-server-specific-optimizations)
|
||||||
|
7. [Upcoming llama.cpp Features (2026)](#7-upcoming-llamacpp-features-2026)
|
||||||
|
8. [Recommended Configurations](#8-recommended-configurations)
|
||||||
|
9. [Sources](#9-sources)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Compilation Flags and Build Optimizations
|
||||||
|
|
||||||
|
### 1.1 GGML_HIP (ROCm) vs GGML_VULKAN: Which Backend to Build
|
||||||
|
|
||||||
|
Both backends are worth building. Neither is universally faster on gfx1151:
|
||||||
|
|
||||||
|
| Workload | Winner | Rationale |
|
||||||
|
|----------|--------|-----------|
|
||||||
|
| Token generation (short ctx) | Vulkan RADV | Lower driver overhead, mature kernel paths |
|
||||||
|
| Token generation (long ctx, 8K+) | ROCm + rocWMMA + FA | Maintains speed as context grows; uses less memory |
|
||||||
|
| Prompt processing (short ctx) | Mixed -- model-dependent | AMDVLK or ROCm hipBLASLt win on some shapes |
|
||||||
|
| Prompt processing (long ctx) | ROCm + rocWMMA-tuned | 96% speedup over untuned rocWMMA at 65K ctx |
|
||||||
|
| Memory efficiency at long ctx | ROCm + FA | Less memory than Vulkan equivalent |
|
||||||
|
|
||||||
|
Benchmark data (Qwen3-30B-A3B UD-Q4_K_XL, gfx1151, flash attention on):
|
||||||
|
|
||||||
|
| Backend | pp512 t/s | tg128 t/s | pp512@130K t/s | tg128@130K t/s |
|
||||||
|
|---------|-----------|-----------|----------------|----------------|
|
||||||
|
| Vulkan RADV | 755.14 | 85.11 | 17.24 | 12.54 |
|
||||||
|
| Vulkan AMDVLK | 741.60 | 81.79 | 10.75 | 3.51 |
|
||||||
|
| ROCm hipBLASLt | 651.93 | 63.95 | 40.35 | 4.97 |
|
||||||
|
| ROCm rocWMMA-tuned | 659.07 | 67.66 | 51.12 | 13.33 |
|
||||||
|
|
||||||
|
Key insight: RADV scales significantly better than AMDVLK for long contexts (3.6x
|
||||||
|
faster tg at 130K depth). ROCm with tuned rocWMMA provides the best long-context
|
||||||
|
prompt processing (3x faster than RADV at 130K).
|
||||||
|
|
||||||
|
### 1.2 ROCm Build Flags
|
||||||
|
|
||||||
|
**Minimal build:**
|
||||||
|
```bash
|
||||||
|
cmake -B build -S . \
|
||||||
|
-DGGML_HIP=ON \
|
||||||
|
-DAMDGPU_TARGETS="gfx1151" \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Optimized build (recommended):**
|
||||||
|
```bash
|
||||||
|
cmake -B build -S . \
|
||||||
|
-DGGML_HIP=ON \
|
||||||
|
-DAMDGPU_TARGETS="gfx1151" \
|
||||||
|
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Critical ROCm build flags:
|
||||||
|
|
||||||
|
| Flag | Effect | Recommendation |
|
||||||
|
|------|--------|----------------|
|
||||||
|
| `-DGGML_HIP=ON` | Enable HIP/ROCm backend | Required |
|
||||||
|
| `-DAMDGPU_TARGETS="gfx1151"` | Target Strix Halo GPU | Required -- do not use gfx1100 |
|
||||||
|
| `-DGGML_HIP_ROCWMMA_FATTN=ON` | Enable rocWMMA flash attention | Strongly recommended for pp |
|
||||||
|
| `-DGGML_HIP_GRAPHS=ON` | HIP graph kernel scheduling | Test -- may help reduce launch overhead |
|
||||||
|
| `-DGGML_HIP_NO_VMM=OFF` | Re-enable Virtual Memory Management | Default is disabled; test if needed |
|
||||||
|
|
||||||
|
#### Flags to be aware of but NOT set by default:
|
||||||
|
|
||||||
|
| Flag | Notes |
|
||||||
|
|------|-------|
|
||||||
|
| `-DGGML_HIP_UMA=ON` | Uses hipMallocManaged for UMA. **Avoid on Strix Halo** -- it uses fine-grained memory that is significantly slower. Standard hipMalloc + GTT expansion via kernel params is faster. |
|
||||||
|
| `-DGGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` | Forces FP16 compute in hipBLAS. Documented for RDNA4 -- may help pp performance on gfx1151. **Test before deploying.** |
|
||||||
|
| `-DGGML_CUDA_FA_ALL_QUANTS=ON` | Compiles all KV cache quant type combinations for FA. **Works for CUDA kernels which HIP reuses via hipify.** Increases compilation time substantially. Enable if you need quantized KV cache with flash attention. |
|
||||||
|
|
||||||
|
#### ROCm version considerations:
|
||||||
|
|
||||||
|
- **ROCm 7.2**: Known rocWMMA compilation issue (ambiguous template specializations in `mfma_impl.hpp`). Fixed in later point releases or by disabling rocWMMA.
|
||||||
|
- **ROCm 7.0 RC / 7.1**: Generally work well with gfx1151.
|
||||||
|
- **ROCm 6.4.4**: Some users report better performance than 7.x for certain workloads. The gfx1151 rocBLAS kernel regression means hipBLASLt is essential.
|
||||||
|
- **Recommendation**: Use ROCm 7.2+ with rocWMMA patches, or pre-built toolbox containers that have been validated for gfx1151.
|
||||||
|
|
||||||
|
### 1.3 Vulkan Build Flags

```bash
cmake -B build -S . \
  -DGGML_VULKAN=ON \
  -DLLAMA_BUILD_SERVER=ON \
  -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j$(nproc)
```

The Vulkan build auto-detects cooperative matrix support (KHR_coopmat) at runtime.
gfx1151 with RADV reports `matrix cores: KHR_coopmat` in llama-bench logs.

#### Vulkan-specific considerations:

| Topic | Detail |
|-------|--------|
| **RADV vs AMDVLK** | RADV (Mesa) is recommended for gfx1151. Better long-context scaling, no 2GB buffer allocation limit. |
| **AMDVLK buffer limit** | AMDVLK caps single Vulkan allocations at ~2 GiB (`VkPhysicalDeviceLimits::maxMemoryAllocationSize`). RADV allows ~4 GiB. This causes OOM for models with large compute buffers. |
| **CoopMat1 vs CoopMat2** | gfx1151 supports KHR_coopmat (CoopMat1). CoopMat2 (`VK_NV_cooperative_matrix2`) is NVIDIA-only. This means Vulkan flash attention on AMD falls back to CPU -- use ROCm for GPU-accelerated FA. |
| **Shader compilation** | Building from source with `glslc` available enables cooperative matrix shader variants. Pre-built binaries may omit them. |
### 1.4 LTO and PGO

llama.cpp does not have built-in LTO/PGO support in its CMake configuration.
You can enable LTO manually:

```bash
cmake -B build -S . \
  -DGGML_HIP=ON \
  -DAMDGPU_TARGETS="gfx1151" \
  -DCMAKE_INTERPROCEDURAL_OPTIMIZATION=ON \
  -DCMAKE_BUILD_TYPE=Release
```

Expected benefit: 2-5% improvement in CPU-bound paths. The GPU kernels are
compiled by the HIP/ROCm compiler and are not affected by host LTO.

PGO would require a two-pass build (instrument, profile, rebuild) and is not
commonly done for llama.cpp. The dominant bottleneck is GPU kernel performance
and memory bandwidth, not host-side code paths.
### 1.5 Compiler Tuning for ROCm

A known LLVM regression affects loop unrolling on RDNA. The following flag has
been reported to help:

```bash
-DCMAKE_CXX_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600"
```

This increases the unrolling threshold for local memory operations, which can
improve kernel performance for flash attention and matrix multiplication.

---
## 2. Runtime Flags and Environment Variables

### 2.1 ROCm Environment Variables

| Variable | Value | Effect |
|----------|-------|--------|
| `ROCBLAS_USE_HIPBLASLT=1` | **Critical** | Switches from rocBLAS tensile kernels to hipBLASLt. On gfx1151, default rocBLAS achieves only 5.76 TFLOPS (<9% efficiency). hipBLASLt achieves >60% efficiency. **This is a 2-7x improvement for prompt processing.** |
| `HSA_OVERRIDE_GFX_VERSION=11.5.1` | Set inside toolbox containers | Required for ROCm to recognize gfx1151. Set in container, not by host scripts. |
| `HSA_ENABLE_SDMA=0` | Optional | Disables SDMA engine. May help on some configurations, but generally not needed on Strix Halo with recent kernels. |
| `HIP_VISIBLE_DEVICES=0` | Optional | Select specific GPU device. Useful in multi-GPU or container setups. |
| `GPU_MAX_HEAP_SIZE=100` | Optional | Allow 100% of GPU memory for heap. Default may be lower. |
| `GPU_MAX_ALLOC_PERCENT=100` | Optional | Allow single allocation up to 100% of GPU memory. |
| `ROCR_VISIBLE_DEVICES=0` | Optional | HSA-level device visibility control. |
| `AMD_LOG_LEVEL=0` | Optional | Suppress AMD driver logging noise. |

**The single most impactful environment variable is `ROCBLAS_USE_HIPBLASLT=1`.**
Without it, ROCm pp512 on Llama-2-7B drops from 882 t/s to 348 t/s (4x slower).
### 2.2 Vulkan Environment Variables

| Variable | Value | Effect |
|----------|-------|--------|
| `AMD_VULKAN_ICD=RADV` | Recommended | Force RADV driver (skip AMDVLK). |
| `RADV_PERFTEST=nogttspill` | **Important** | Fixes GTT memory spilling issues on RADV. Can resolve significant pp performance drops (especially with FA off). |
| `GGML_VK_VISIBLE_DEVICES=0` | Optional | Select Vulkan device index. |
| `GGML_VULKAN_DISABLE_F16=1` | Debugging | Force FP32 compute. Slower but useful for debugging precision issues. |
| `GGML_LOG_LEVEL=2` | Debugging | Verbose logging to verify coopmat detection. |
### 2.3 Thread Count (`-t` flag)

For GPU-dominant inference (all layers offloaded), the thread count has minimal
impact on throughput. The recommendation:

- **Single-user inference**: `-t 4` to `-t 8` (enough for tokenization/sampling overhead)
- **Server with parallel slots**: `-t` equal to physical core count (12 on Ryzen AI MAX+ 395 = 12 Zen 5 cores)
- **Hybrid CPU+GPU (partial offload)**: `-t` equal to number of physical cores

The Ryzen AI MAX+ 395 has 16 cores (12 Zen 5 + 4 Zen 5c). For llama.cpp, using
all 12 big cores (`-t 12`) is optimal for CPU-involved workloads.
### 2.4 Batch Size Tuning (`-b` and `-ub`)

| Flag | Default | Role |
|------|---------|------|
| `-b` / `--batch-size` | 2048 | Logical batch size (application level) |
| `-ub` / `--ubatch-size` | 512 | Physical batch size (device level) |

Tuning guidance for gfx1151:

- **MoE models**: `-b 256` significantly improves pp512 (reported 70% improvement on Qwen3-30B-A3B)
- **Dense models**: Default `-b 2048` is generally fine
- **Long context**: `-ub 2048` can improve performance, but test against OOM
- **Ultra-long context**: Reduce `-ub` if memory allocation fails

The Vulkan backend blog post for Strix Halo recommends: `-c 32768 -b 4096 -ub 256`
for a good balance of performance and memory.
### 2.5 Memory-Mapped Loading (`-mmp` / `--no-mmap`)

**Critical finding for unified memory APUs:**

> When you load large models to the GPU, memory mapping can make loading moderately
> slower for Vulkan, and **catastrophically slower for ROCm**. You should always set
> `--mmap 0` or `--no-mmap` to improve model loading times on Strix Halo.

For `llama-bench`, use `-mmp 0`. For `llama-server`/`llama-cli`, use `--no-mmap`.

On Strix Halo, both "GPU memory" and "CPU memory" share the same physical LPDDR5x.
The difference is which pages are mapped for GPU access. GPU-mapped pages have full
bandwidth (~215 GB/s). CPU-accessed pages get approximately half (~84 GB/s for
CPU-to-GPU copies).

**Always use `-ngl 99` (or higher) to ensure all layers are on GPU memory.**
Even on a unified memory system, GPU memory paths provide 2x the bandwidth.
### 2.6 GPU Layer Offloading (`-ngl`)

For Strix Halo with 64GB unified memory:

- **Models < 50GB**: `-ngl 99` offloads everything. No tuning needed.
- **Models 50-60GB**: `-ngl 99` should still work with GTT expanded via kernel params.
- **Models > 60GB**: May need partial offload. Use `-ngl <N>` where N is tuned to
  keep GPU memory under the GTT limit. Remaining layers run on CPU at ~1/2 bandwidth.

**Never let GPU spill to system RAM paths** -- performance will be worse than pure CPU.

---
## 3. Flash Attention and Attention Backends

### 3.1 When to Enable Flash Attention

**Rule of thumb for gfx1151:**

| Backend | Flash Attention | Recommendation |
|---------|----------------|----------------|
| ROCm + rocWMMA | `-fa 1` | **Always enable.** 24% pp improvement, maintains tg speed, uses less memory. |
| ROCm without rocWMMA | `-fa 1` | Enable, but smaller improvement. |
| Vulkan RADV | `-fa 1` | **Enable for short contexts.** Minor improvement at pp512/tg128. At long contexts, Vulkan FA may degrade performance. |
| Vulkan AMDVLK | `-fa 1` | Similar to RADV. |

**Key caveat**: Vulkan flash attention on AMD uses CoopMat1 (KHR_coopmat), not
the more efficient CoopMat2 (NVIDIA-only). For AMD, ROCm + rocWMMA is the superior
FA path.

### 3.2 rocWMMA Flash Attention Performance

Benchmark on gfx1151 (Llama2-7B Q4_K_M):

| Configuration | pp512 t/s | tg128 t/s |
|---------------|-----------|-----------|
| HIP standard | 592.28 | 40.40 |
| HIP + hipBLASLt | 548.72 | 40.43 |
| HIP + rocWMMA + hipBLASLt | 1006.80 | 39.46 |
| HIP + rocWMMA (no hipBLASLt) | 899.73 | 39.45 |

rocWMMA provides ~70% improvement in prompt processing with flash attention.
Token generation is slightly slower (~2%) due to WMMA overhead at small batch.
### 3.3 The rocWMMA Long-Context Regression and Fix
|
||||||
|
|
||||||
|
The standard rocWMMA implementation has a **long-context decode regression**:
|
||||||
|
at 65K context, tg degrades by up to 57% compared to HIP-only baseline.
|
||||||
|
|
||||||
|
**The fix** (PR #16827, "rocm-wmma-tune" branch) implements:
|
||||||
|
|
||||||
|
1. **`__launch_bounds__(256, 2)`**: Ensures minimum 2 blocks per SM, improving occupancy
|
||||||
|
2. **Adaptive KQ stride**: Uses stride 128 when head dimension <= 128, reducing LDS footprint
|
||||||
|
3. **Selective WMMA usage**: WMMA only for prefill; decode reverts to VEC/TILE kernels
|
||||||
|
|
||||||
|
Results after fix (Llama 3.2 1B Q4_K_M on gfx1151):
|
||||||
|
- pp512 at 65K context: **96% faster** than untuned rocWMMA
|
||||||
|
- tg128 at 65K context: Matches HIP baseline (previously 57% degraded)
|
||||||
|
|
||||||
|
**Status**: This patch is available in `-rocwmma-improved` toolbox builds. It may
|
||||||
|
not be merged into upstream llama.cpp. Check Donato Capitella's toolboxes.
|
||||||
|
|
||||||
|
### 3.4 Vulkan Flash Attention Limitations on AMD
|
||||||
|
|
||||||
|
The Vulkan backend supports three FA paths:
|
||||||
|
|
||||||
|
| Path | Extension | AMD Support |
|
||||||
|
|------|-----------|-------------|
|
||||||
|
| FA_SCALAR | None | Yes (CPU fallback) |
|
||||||
|
| FA_COOPMAT1 | KHR_cooperative_matrix | Yes (gfx1151 reports support) |
|
||||||
|
| FA_COOPMAT2 | NV_cooperative_matrix2 | **No** (NVIDIA-only) |
|
||||||
|
|
||||||
|
FA_COOPMAT1 supports: f16, q4_0, q8_0, f32 KV cache types.
|
||||||
|
FA_COOPMAT2 additionally supports all quant types.
|
||||||
|
|
||||||
|
When Vulkan FA is enabled on AMD with RADV, it uses CoopMat1 for matrix operations.
|
||||||
|
This provides a modest improvement over scalar FA but is significantly less
|
||||||
|
efficient than ROCm + rocWMMA.
|
||||||
|
|
||||||
|
### 3.5 New Attention Models (GatedDeltaNet)
|
||||||
|
|
||||||
|
Models using GatedDeltaNet architecture (Qwen3.5-27B, Qwen3.5-35B-A3B) have
|
||||||
|
severe performance problems on gfx1151:
|
||||||
|
|
||||||
|
- **Vulkan**: No GATED_DELTA_NET compute shader exists; ops fall back to CPU
|
||||||
|
- **ROCm/HIP**: Kernel cross-compiles but suffers from register spilling (float s[S_v]
|
||||||
|
allocates up to 512 bytes per thread) and hipMemcpyWithStream bottleneck (92-95%
|
||||||
|
of decode time on models >15GB)
|
||||||
|
|
||||||
|
Result: Qwen3.5-27B runs at ~12 t/s on gfx1151 vs expected 50-80 t/s.
|
||||||
|
**Avoid GatedDeltaNet models on gfx1151 until kernel optimization lands.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Quantization Strategies for Speed
|
||||||
|
|
||||||
|
### 4.1 Quantization Speed on RDNA 3.5
|
||||||
|
|
||||||
|
Token generation speed is dominated by memory bandwidth, not compute. Smaller
|
||||||
|
quantizations are faster because they reduce bytes-per-weight, allowing more
|
||||||
|
tokens per second within the ~215 GB/s bandwidth envelope.
|
||||||
|
|
||||||
|
Approximate throughput formula for decode (bandwidth-bound):
|
||||||
|
```
|
||||||
|
tg_tokens/s ≈ effective_bandwidth_GB/s / model_size_bytes * 1e9
|
||||||
|
```
|
||||||
|
|
||||||
|
For a 7B Q4_K_M model (~4.1 GB):
|
||||||
|
```
|
||||||
|
215 / 4.1 ≈ 52 t/s (theoretical max; practical ~50 t/s on gfx1151)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 Quantization Type Comparison
|
||||||
|
|
||||||
|
| Quant | Bits/Weight | Quality | Speed (relative) | Notes |
|
||||||
|
|-------|------------|---------|-------------------|-------|
|
||||||
|
| Q4_0 | 4.0 | Low | Fastest | Legacy. Simple dequant. |
|
||||||
|
| Q4_K_M | 4.83 | Good | Very fast | K-quant with hierarchical blocks. Recommended default. |
|
||||||
|
| IQ4_XS | 4.25 | Good | Fast | Importance-weighted. Better quality/bit than Q4_K_M. |
|
||||||
|
| Q5_K_M | 5.69 | Very good | Fast | Sweet spot for quality-sensitive use. |
|
||||||
|
| Q6_K | 6.56 | Excellent | Moderate | Near-lossless quality. |
|
||||||
|
| Q8_0 | 8.0 | Near-perfect | Slower | 2x the bytes of Q4_K_M, ~2x slower tg. |
|
||||||
|
| F16 | 16.0 | Perfect | Slowest | Reference baseline. |
|
||||||
|
|
||||||
|
**For RDNA 3.5 specifically**:
|
||||||
|
|
||||||
|
- **Q4_K_M** is the best general-purpose quantization. The K-quant family uses
|
||||||
|
hierarchical super-blocks (256 values) with per-sub-block scales, providing
|
||||||
|
better quality than Q4_0 at marginally higher dequant cost that is invisible
|
||||||
|
at the GPU level.
|
||||||
|
|
||||||
|
- **Q4_0** has the simplest dequant kernels and is marginally faster than Q4_K_M
|
||||||
|
on some GPU backends. However, the quality loss is significant. Use only for
|
||||||
|
smoke tests or when every t/s matters more than quality.
|
||||||
|
|
||||||
|
- **IQ4_XS** (importance-matrix quantized) offers better quality per bit than
|
||||||
|
Q4_K_M. Speed is similar. Requires an importance matrix file during quantization.
|
||||||
|
**Recommended over Q4_K_M when you control the quantization process.**
|
||||||
|
|
||||||
|
- **Q8_0** does NOT have special hardware-accelerated dequant on RDNA 3.5.
|
||||||
|
RDNA 3.5 lacks INT8 tensor core equivalents. Q8_0 performance relies on the
|
||||||
|
same FP16 compute paths, just with more memory bandwidth consumed.
|
||||||
|
|
||||||
|
### 4.3 Importance Matrix (imatrix) Quantization
|
||||||
|
|
||||||
|
imatrix quantization records how much each weight affects output quality, then
|
||||||
|
allocates more precision bits to important weights. This is essential for
|
||||||
|
sub-4-bit quantizations (IQ2_XS, IQ3_XXS, IQ4_XS) where standard K-quant
|
||||||
|
shows measurable degradation.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate importance matrix (GPU-accelerated)
|
||||||
|
llama-imatrix -m model-f16.gguf -f calibration_data.txt -ngl 99 -o imatrix.dat
|
||||||
|
|
||||||
|
# Quantize with imatrix
|
||||||
|
llama-quantize --imatrix imatrix.dat model-f16.gguf model-iq4_xs.gguf IQ4_XS
|
||||||
|
```
|
||||||
|
|
||||||
|
**Speed impact**: None. imatrix affects quantization quality, not inference speed.
|
||||||
|
The dequantization kernels are identical regardless of whether imatrix was used.
|
||||||
|
|
||||||
|
### 4.4 Unsloth Dynamic (UD) Quantizations
|
||||||
|
|
||||||
|
Unsloth Dynamic 2.0 selectively quantizes different layers at different bit widths,
|
||||||
|
choosing the optimal quantization per layer based on sensitivity analysis.
|
||||||
|
|
||||||
|
**Speed impact**: Minimal to none. UD quants use the same dequant kernels as
|
||||||
|
standard GGUF quantizations. A UD-Q4_K_XL file runs at the same speed as a
|
||||||
|
standard Q4_K_M of the same total size.
|
||||||
|
|
||||||
|
**Quality impact**: Significantly better. UD consistently outperforms standard
|
||||||
|
quantizations in 5-shot MMLU and KL divergence metrics at the same total file size.
|
||||||
|
|
||||||
|
**Recommendation**: Prefer UD quants (e.g., `UD-Q4_K_XL`, `UD-Q4_K_M`) from
|
||||||
|
Unsloth when available. They are a free quality upgrade with no speed penalty.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Memory Layout and Caching
|
||||||
|
|
||||||
|
### 5.1 KV Cache Quantization
|
||||||
|
|
||||||
|
KV cache quantization reduces the memory footprint of the attention cache,
|
||||||
|
allowing larger context windows within the same memory budget.
|
||||||
|
|
||||||
|
| Cache Type | Memory vs F16 | Quality Impact | Recommendation |
|
||||||
|
|------------|--------------|----------------|----------------|
|
||||||
|
| f16 (default) | 1.0x | None | Baseline |
|
||||||
|
| q8_0 | 0.5x | Negligible (+0.002-0.05 ppl) | **Recommended for production** |
|
||||||
|
| q4_0 | 0.33x | Noticeable (+0.2-0.25 ppl) | Use when memory-constrained |
|
||||||
|
| q4_1 | 0.33x | Slightly better than q4_0 | Alternative to q4_0 |
|
||||||
|
| iq4_nl | 0.33x | Better than q4_0 | Best 4-bit KV option |
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
```bash
|
||||||
|
llama-server -m model.gguf --cache-type-k q8_0 --cache-type-v q8_0 ...
|
||||||
|
# or for llama-bench:
|
||||||
|
# Not directly supported as flags; test via llama-server
|
||||||
|
```
|
||||||
|
|
||||||
|
**Performance impact**: Quantizing K cache slightly **improves** throughput
|
||||||
|
(less memory to read). Quantizing V cache may have a slight negative impact.
|
||||||
|
Overall performance impact is negligible for normal inference.
|
||||||
|
|
||||||
|
**Caveat with speculative decoding**: Using KV cache quantization with a draft
|
||||||
|
model causes a consistent ~16% performance drop. q4_0 KV with speculative
|
||||||
|
decoding causes massive acceptance rate drops. **Avoid KV quant if using
|
||||||
|
speculative decoding.**
|
||||||
|
|
||||||
|
### 5.2 mmap vs Full Load on Unified Memory
|
||||||
|
|
||||||
|
On Strix Halo's unified memory architecture:
|
||||||
|
|
||||||
|
- **`--no-mmap` is strongly recommended** for both ROCm and Vulkan.
|
||||||
|
- With mmap enabled, ROCm model loading is "catastrophically slower."
|
||||||
|
- Vulkan loading is "moderately slower" with mmap.
|
||||||
|
- Since CPU and GPU share physical RAM, there is no data copy when loading
|
||||||
|
to "GPU memory" -- it is just a page table update.
|
||||||
|
|
||||||
|
For `llama-bench`: Always use `-mmp 0`.
|
||||||
|
For `llama-server`/`llama-cli`: Always use `--no-mmap`.
|
||||||
|
|
||||||
|
### 5.3 Prompt Caching
|
||||||
|
|
||||||
|
llama-server supports two levels of prompt caching:
|
||||||
|
|
||||||
|
**1. Automatic KV cache reuse (`cache_prompt: true`)**:
|
||||||
|
Reuses KV cache from previous requests when prompts share a common prefix.
|
||||||
|
The server only reprocesses the suffix that differs.
|
||||||
|
|
||||||
|
**2. Host-memory prompt caching (`--cram N`)**:
|
||||||
|
Stores pre-computed prompt representations in system RAM.
|
||||||
|
- Reduces TTFT from ~4.2s to ~0.3s for cached requests (93% reduction)
|
||||||
|
- +6% token throughput (34 vs 32 t/s)
|
||||||
|
- Memory formula: `num_prefixes * avg_prefix_tokens * 8 bytes`
|
||||||
|
|
||||||
|
Configuration:
|
||||||
|
```bash
|
||||||
|
llama-server -m model.gguf \
|
||||||
|
--cram 256 \ # 256 MB host RAM for prompt cache
|
||||||
|
--cache-type-k q8_0 \ # KV cache quantization
|
||||||
|
--cache-type-v q8_0 \
|
||||||
|
--no-mmap \
|
||||||
|
-fa \
|
||||||
|
-ngl 99
|
||||||
|
```
|
||||||
|
|
||||||
|
Best for:
|
||||||
|
- System prompts > 5K tokens
|
||||||
|
- Multi-user chatbots with shared context
|
||||||
|
- Agentic use with repeated tool-call prefixes
|
||||||
|
|
||||||
|
### 5.4 UMA Detection Bug (Issue #18159)
|
||||||
|
|
||||||
|
llama.cpp's UMA detection (from PR #17368, designed for NVIDIA DGX Spark)
|
||||||
|
incorrectly activates on AMD APUs when `prop.integrated=1`. It reads
|
||||||
|
`/proc/meminfo` instead of `hipMemGetInfo()`, severely underreporting available
|
||||||
|
GPU memory (e.g., reporting 27GB instead of 96GB).
|
||||||
|
|
||||||
|
**Workarounds**:
|
||||||
|
- Build without `GGML_CUDA_ENABLE_UNIFIED_MEMORY`
|
||||||
|
- Guard UMA detection with `!defined(GGML_USE_HIP)` (upstream fix pending)
|
||||||
|
- Use toolbox containers where this has been patched
|
||||||
|
|
||||||
|
### 5.5 KV Cache Placement on ROCm (Issue #18011)
|
||||||
|
|
||||||
|
On Strix Halo, the ROCm backend may dump KV cache into shared (CPU-accessible)
|
||||||
|
memory instead of GPU-mapped memory, causing performance degradation at high
|
||||||
|
context sizes. This is a known issue contributing to ROCm falling behind Vulkan
|
||||||
|
for tg at high contexts.
|
||||||
|
|
||||||
|
**Mitigation**: Use the rocWMMA-tuned branch which maintains better memory
|
||||||
|
placement, or use Vulkan RADV for workloads where this matters.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. llama-server Specific Optimizations
|
||||||
|
|
||||||
|
### 6.1 Recommended Server Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-server -m model.gguf \
|
||||||
|
--host 0.0.0.0 --port 8080 \
|
||||||
|
-ngl 99 \
|
||||||
|
--no-mmap \
|
||||||
|
-fa \
|
||||||
|
-c 32768 \ # Total context across all slots
|
||||||
|
-np 4 \ # 4 parallel slots (adjust for your use)
|
||||||
|
-b 2048 \ # Logical batch size
|
||||||
|
-ub 512 \ # Physical batch size
|
||||||
|
--cache-type-k q8_0 \
|
||||||
|
--cache-type-v q8_0 \
|
||||||
|
--cont-batching \ # Enabled by default
|
||||||
|
--jinja # Enable Jinja2 chat template
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 Parallel Slot Configuration (`-np`)
|
||||||
|
|
||||||
|
| Use Case | Slots | Context per Slot | Total `-c` |
|
||||||
|
|----------|-------|-----------------|-----------|
|
||||||
|
| Single user chat | 1 | 32768 | 32768 |
|
||||||
|
| Agentic coding (Claude Code style) | 2-4 | 8192-16384 | 32768-65536 |
|
||||||
|
| Multi-user API | 4-8 | 4096-8192 | 32768-65536 |
|
||||||
|
| Eval harness | 1 | 32768+ | 32768+ |
|
||||||
|
|
||||||
|
Memory formula: Each slot requires `context_size * 2 * hidden_dim * n_layers * bytes_per_kv_element`.
|
||||||
|
With q8_0 KV cache, this is roughly halved compared to f16.
|
||||||
|
|
||||||
|
### 6.3 Continuous Batching
|
||||||
|
|
||||||
|
Enabled by default (`--cont-batching`). Allows the server to process multiple
|
||||||
|
requests simultaneously, interleaving prefill and decode operations.
|
||||||
|
|
||||||
|
For agentic workloads: One slot typically holds a large system prompt + conversation
|
||||||
|
context, while additional slots handle parallel tool calls. Configure with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
-np 4 -c 131072 # 4 slots, up to 32K context each
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.4 Prompt Caching for Agentic Use
|
||||||
|
|
||||||
|
For agentic coding tools that send the same system prompt repeatedly:
|
||||||
|
|
||||||
|
1. Use `cache_prompt: true` in API requests (reuses KV cache prefix)
|
||||||
|
2. Use `--system-prompt-file system.txt` for static system prompts (note: may be
|
||||||
|
removed in recent versions; verify with your build)
|
||||||
|
3. Use `--cram 128` to enable host-memory caching for prefix deduplication
|
||||||
|
|
||||||
|
### 6.5 Speculative Decoding
|
||||||
|
|
||||||
|
For token generation speedup with a draft model:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-server -m main-model.gguf \
|
||||||
|
--model-draft draft-model.gguf \
|
||||||
|
-ngl 99 \
|
||||||
|
--draft-max 8 \
|
||||||
|
--draft-min 1 \
|
||||||
|
--no-mmap \
|
||||||
|
-fa
|
||||||
|
```
|
||||||
|
|
||||||
|
**Caveat**: Do NOT combine speculative decoding with KV cache quantization.
|
||||||
|
The 16% performance drop and reduced acceptance rate negate the benefits.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Upcoming llama.cpp Features (2026)
|
||||||
|
|
||||||
|
### 7.1 Backend-Agnostic Tensor Parallelism (PR #19378)
|
||||||
|
|
||||||
|
Merged January 2026. Adds `--split-mode tensor` for splitting computation across
|
||||||
|
multiple GPUs via a new "meta" backend.
|
||||||
|
|
||||||
|
**Relevance to Strix Halo**: Limited. Single integrated GPU. However, for RPC
|
||||||
|
configurations with multiple Strix Halo nodes (Jeff Geerling's Beowulf cluster),
|
||||||
|
tensor parallelism could complement the existing layer-split approach.
|
||||||
|
|
||||||
|
Currently supports 1-2 GPUs with equal data split. `--tensor-split` has no effect yet.
|
||||||
|
|
||||||
|
### 7.2 TurboQuant KV Cache Compression (ICLR 2026)
|
||||||
|
|
||||||
|
Google's TurboQuant (Zandieh et al.) achieves 3-bit KV cache quantization with
|
||||||
|
no training and negligible quality loss:
|
||||||
|
|
||||||
|
| Format | MSE vs FP16 | Compression |
|
||||||
|
|--------|-------------|-------------|
|
||||||
|
| TQ3 (3-bit) | 0.034 | 4.9x |
|
||||||
|
| TQ4 (4-bit) | 0.009 | 3.8x |
|
||||||
|
|
||||||
|
**Timeline**: Open-source llama.cpp integration expected Q2-Q3 2026. A 6-phase
|
||||||
|
integration plan exists covering GGML type registration, KV cache paths, FA
|
||||||
|
integration, and CLI flags.
|
||||||
|
|
||||||
|
### 7.3 Vulkan Improvements
|
||||||
|
|
||||||
|
Active 2025-2026 developments:
|
||||||
|
- Mesa RADV optimizations for RDNA4 AI workloads (Rhys Perry/Valve patches)
|
||||||
|
- 13% pp improvement from CU mode optimization for LDS utilization
|
||||||
|
- BFloat16 Vulkan support (`VK_KHR_shader_bfloat16`) maturing in Mesa 25.x
|
||||||
|
- Partial offloading performance improvement for AMD (llama.cpp b8185, March 2026)
|
||||||
|
|
||||||
|
### 7.4 Flash Attention for Head Dimension 512
|
||||||
|
|
||||||
|
Pull request from March 2026 adds FA support for HD=512 in CUDA kernels.
|
||||||
|
This benefits models with larger head dimensions (some newer architectures).
|
||||||
|
The HIP path should inherit this via hipify.
|
||||||
|
|
||||||
|
### 7.5 ik_llama.cpp Fork Innovations
|
||||||
|
|
||||||
|
The `ik_llama.cpp` fork by ikawrakow introduces:
|
||||||
|
- Row-interleaved quant packing (better memory access patterns)
|
||||||
|
- Smart Expert Reduction for faster MoE inference
|
||||||
|
- Tensor overrides with regex patterns for hybrid GPU/CPU placement
|
||||||
|
- FlashMLA for DeepSeek models
|
||||||
|
|
||||||
|
**Caveat**: ik_llama.cpp only fully supports CPU and CUDA backends. ROCm/Vulkan
|
||||||
|
are not maintained. Not recommended for AMD gfx1151.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Recommended Configurations

### 8.1 For llama-bench (Benchmarking)

**ROCm backend:**
```bash
ROCBLAS_USE_HIPBLASLT=1 \
toolbox run -c llama-rocm-7.2 -- \
  /path/to/llama-bench \
  -m /path/to/model.gguf \
  -ngl 99 -mmp 0 -fa 1 \
  -p 512 -n 128 -r 5
```

**Vulkan backend:**
```bash
AMD_VULKAN_ICD=RADV \
RADV_PERFTEST=nogttspill \
toolbox run -c llama-vulkan -- \
  /path/to/llama-bench \
  -m /path/to/model.gguf \
  -ngl 99 -mmp 0 -fa 1 \
  -p 512 -n 128 -r 5
```

### 8.2 For llama-server (Production Agentic Use)

**ROCm (best for long context):**
```bash
ROCBLAS_USE_HIPBLASLT=1 \
llama-server -m model.gguf \
  -ngl 99 --no-mmap -fa \
  -c 65536 -np 4 \
  -b 2048 -ub 512 \
  --cache-type-k q8_0 --cache-type-v q8_0 \
  --cram 256 \
  --jinja --cont-batching \
  --host 0.0.0.0 --port 8080
```

**Vulkan RADV (best for single-user tg):**
```bash
AMD_VULKAN_ICD=RADV \
RADV_PERFTEST=nogttspill \
llama-server -m model.gguf \
  -ngl 99 --no-mmap -fa \
  -c 32768 -np 2 \
  -b 4096 -ub 256 \
  --cache-type-k q8_0 --cache-type-v q8_0 \
  --jinja --cont-batching \
  --host 0.0.0.0 --port 8080
```

### 8.3 Decision Matrix

| Question | Answer |
|----------|--------|
| Which backend for benchmarking? | Both. ROCm and Vulkan have different strengths. |
| Which backend for daily chat? | Vulkan RADV for best tg speed. |
| Which backend for long-context agentic? | ROCm + rocWMMA-tuned for context resilience. |
| Which quantization? | Q4_K_M or UD-Q4_K_XL for speed; Q5_K_M for quality. |
| Enable flash attention? | Yes, always on ROCm. Yes on Vulkan for short contexts. |
| Use `--no-mmap`? | Always. |
| Set `ROCBLAS_USE_HIPBLASLT=1`? | Always for ROCm. |
| Set `RADV_PERFTEST=nogttspill`? | Always for Vulkan RADV. |
| KV cache quantization? | q8_0 for both K and V unless using speculative decoding. |
| Batch size for MoE? | `-b 256` (lower than default improves some MoE models). |
---
|
||||||
|
|
||||||
|
## 9. Sources
|
||||||
|
|
||||||
|
### GitHub Issues and Discussions
|
||||||
|
- [Performance of llama.cpp on AMD ROCm (HIP) - Discussion #15021](https://github.com/ggml-org/llama.cpp/discussions/15021)
|
||||||
|
- [Performance of llama.cpp with Vulkan - Discussion #10879](https://github.com/ggml-org/llama.cpp/discussions/10879)
|
||||||
|
- [HIP backend performs poorly on gfx1151 - Issue #13565](https://github.com/ggml-org/llama.cpp/issues/13565)
|
||||||
|
- [UMA detection incorrectly limits memory on AMD APUs - Issue #18159](https://github.com/ggml-org/llama.cpp/issues/18159)
|
||||||
|
- [ROCm model loading dumps KV cache to shared memory - Issue #18011](https://github.com/ggml-org/llama.cpp/issues/18011)
|
||||||
|
- [GATED_DELTA_NET underperformance on gfx1151 - Issue #20354](https://github.com/ggml-org/llama.cpp/issues/20354)
|
||||||
|
- [Under-Performance of ROCm 7.2 binaries - Issue #19984](https://github.com/ggml-org/llama.cpp/issues/19984)
|
||||||
|
- [ROCm 7.2 + rocWMMA compilation - Issue #19269](https://github.com/ggml-org/llama.cpp/issues/19269)
|
||||||
|
- [Building for gfx1151 - Issue #14734](https://github.com/ggml-org/llama.cpp/issues/14734)
|
||||||
|
- [AMDVLK 2GB buffer allocation limit - Issue #15054](https://github.com/ggml-org/llama.cpp/issues/15054)
|
||||||
|
- [Mastering Host-Memory Prompt Caching - Discussion #20574](https://github.com/ggml-org/llama.cpp/discussions/20574)
|
||||||
|
- [TurboQuant Extreme KV Cache Quantization - Discussion #20969](https://github.com/ggml-org/llama.cpp/discussions/20969)
|
||||||
|
- [Backend-agnostic tensor parallelism - PR #19378](https://github.com/ggml-org/llama.cpp/pull/19378)
|
||||||
|
- [Massively Improved rocWMMA Performance - PR #16827](https://github.com/ggml-org/llama.cpp/pull/16827)
|
||||||
|
- [rocWMMA for gfx1151 performance boost - lemonade-sdk Issue #7](https://github.com/lemonade-sdk/llamacpp-rocm/issues/7)
|
||||||
|
- [Increase llama.cpp performance on AI Max 395+ - geerlingguy Issue #5](https://github.com/geerlingguy/beowulf-ai-cluster/issues/5)
|
||||||
|
|
||||||
|
### Wiki and Community Resources
|
||||||
|
- [Strix Halo Wiki - llama.cpp Performance](https://strixhalo.wiki/AI/llamacpp-performance)
|
||||||
|
- [Strix Halo Wiki - llama.cpp with ROCm](https://strixhalo.wiki/AI/llamacpp-with-ROCm)
|
||||||
|
- [AMD Strix Halo Backend Benchmarks (Grid View)](https://kyuz0.github.io/amd-strix-halo-toolboxes/)
|
||||||
|
- [LLM Tracker - AMD Strix Halo GPU Performance](https://llm-tracker.info/AMD-Strix-Halo-(Ryzen-AI-Max+-395)-GPU-Performance)
|
||||||
|
- [Framework Community - Strix Halo GPU LLM Performance Tests](https://community.frame.work/t/amd-strix-halo-ryzen-ai-max-395-gpu-llm-performance-tests/72521)
|
||||||
|
- [Framework Community - Toolboxes for LLM inference on Strix Halo](https://community.frame.work/t/llama-cpp-vllm-toolboxes-for-llm-inference-on-strix-halo/74916)
|
||||||
|
|
||||||
|
### Articles and Blog Posts
|
||||||
|
- [Hardware Corner - Strix Halo LLM Optimization](https://www.hardware-corner.net/strix-halo-llm-optimization/)
|
||||||
|
- [Hardware Corner - RADV Vulkan Driver 13% Improvement](https://www.hardware-corner.net/llama-cpp-amd-radv-vulkan-driver-update/)
|
||||||
|
- [Phoronix - AMD ROCm 7.1 vs RADV Vulkan](https://www.phoronix.com/review/rocm-71-llama-cpp-vulkan)
|
||||||
|
- [Phoronix - Valve Developer RADV Improvement](https://www.phoronix.com/news/RADV-Valve-Boost-Llama.cpp)
|
||||||
|
- [Yifei's Blog - Strix Halo Matrix Cores with llama.cpp](https://blog.yifei.sg/jekyll/update/2025/08/27/building-llamacpp-strix-halo.html)
|
||||||
|
- [Strix Halo CUDA/HIP Testing Notes (lhl)](https://github.com/lhl/strix-halo-testing/blob/main/llama-cpp-cuda-hip.md)
|
||||||
|
|
||||||
|
### Official Documentation
|
||||||
|
- [ROCm - llama.cpp compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html)
|
||||||
|
- [ROCm - llama.cpp installation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html)
|
||||||
|
- [ROCm Blog - Llama.cpp Meets Instinct](https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html)
|
||||||
|
- [llama.cpp build documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||||
|
- [llama-server README](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
|
||||||
|
|
||||||
|
### Papers
|
||||||
|
- "Which Quantization Should I Use? A Unified Evaluation of llama.cpp Quantization on Llama-3.1-8B-Instruct" (January 2026, arXiv:2601.14277)
|
||||||
|
- "TurboQuant: Redefining AI efficiency with extreme compression" (Zandieh et al., ICLR 2026)
|
||||||
|
- [Unsloth Dynamic 2.0 GGUFs Documentation](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open Questions / Limitations
|
||||||
|
|
||||||
|
1. **rocWMMA-tuned patch upstream status**: PR #16827 may not be fully merged.
|
||||||
|
Monitor for inclusion in mainline llama.cpp or continue using patched toolboxes.
|
||||||
|
|
||||||
|
2. **ROCm 7.2 stability on gfx1151**: Multiple reports of crashes (MUT_MAL errors),
|
||||||
|
performance regressions, and compilation issues. ROCm 7.x is maturing but
|
||||||
|
not fully stable for gfx1151 as of March 2026.
|
||||||
|
|
||||||
|
3. **Vulkan CoopMat FA for AMD**: Will AMD ever get CoopMat2 support? The current
|
||||||
|
CoopMat1 path provides modest improvement. A native AMD CoopMat2 or equivalent
|
||||||
|
extension would close the gap with ROCm FA.
|
||||||
|
|
||||||
|
4. **KV cache placement on ROCm**: Issue #18011 (KV cache dumped to shared memory)
|
||||||
|
reduces ROCm tg performance at high contexts. Root cause appears to be in
|
||||||
|
HIP memory allocation behavior on APUs.
|
||||||
|
|
||||||
|
5. **GGML_HIP_UMA vs kernel-param GTT expansion**: The UMA flag uses slow
|
||||||
|
fine-grained memory. GTT expansion via the `amdgpu.gttsize` kernel parameter provides
|
||||||
|
coarse-grained GPU-mapped memory that is much faster. The upstream approach
|
||||||
|
may eventually improve, but kernel params remain the correct method for now.
|
||||||
|
|
||||||
|
6. **GatedDeltaNet architecture support**: Both Vulkan (missing shader) and ROCm
|
||||||
|
(register pressure, memcpy bottleneck) perform poorly on GDN models. This
|
||||||
|
blocks efficient use of Qwen3.5-27B and similar models.
|
||||||
|
|
||||||
|
7. **TurboQuant integration timeline**: Expected Q2-Q3 2026 for llama.cpp.
|
||||||
|
Would provide 3-bit KV cache with no quality loss, roughly doubling available
|
||||||
|
context within the same memory budget.
|
||||||
|
|
||||||
|
8. **NPU utilization**: The 50 TOPS NPU on Strix Halo is currently Linux-unusable
|
||||||
|
for llama.cpp inference. AMD driver support for NPU on Linux remains pending.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overlap Notes
|
||||||
|
|
||||||
|
- **Kernel parameters** (`amdgpu.gttsize`, `ttm.pages_limit`, `iommu=pt`):
|
||||||
|
Already documented in the project's `scripts/optimize/kernel-params.sh`.
|
||||||
|
This research covers the llama.cpp side (why they matter for inference).
|
||||||
|
|
||||||
|
- **BIOS VRAM allocation**: Reducing dedicated VRAM in BIOS frees more memory
|
||||||
|
for GTT. This is documented in the project's audit scripts but is a prerequisite
|
||||||
|
for the optimizations described here.
|
||||||
|
|
||||||
|
- **Toolbox container builds**: The project uses pre-built toolboxes
|
||||||
|
(`llama-rocm-7.2`, `llama-vulkan`). The compilation flags documented here
|
||||||
|
describe what should be baked into those containers.
|
||||||
246
docs/optimization-log.md
Normal file
246
docs/optimization-log.md
Normal file
@@ -0,0 +1,246 @@
|
|||||||
|
# Optimization Log
|
||||||
|
|
||||||
|
Living document tracking what was applied, tested, and the actual results. Each entry records the change, benchmark evidence, and verdict.
|
||||||
|
|
||||||
|
**Verdicts**: KEEP (applied permanently), REVERTED (tested, didn't help), PENDING (not yet tested), BLOCKED (can't test yet).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1: Core System
|
||||||
|
|
||||||
|
### 1.1 Tuned Profile: accelerator-performance
|
||||||
|
|
||||||
|
- **Date**: 2026-03-26
|
||||||
|
- **Change**: `sudo tuned-adm profile accelerator-performance`
|
||||||
|
- **Benchmark**: `data/benchmarks/after-tuned-*`
|
||||||
|
- **Result**: +5-8% pp improvement, +2-3% tg improvement
|
||||||
|
- **Verdict**: KEEP
|
||||||
|
|
||||||
|
### 1.2 Kernel Boot Parameters
|
||||||
|
|
||||||
|
- **Date**: 2026-03-26
|
||||||
|
- **Change**: `iommu=pt amdgpu.gttsize=60416 ttm.pages_limit=15466496`
|
||||||
|
- **Benchmark**: `data/benchmarks/full-opt-all-models-*`
|
||||||
|
- **Result**: Combined with BIOS VRAM change. Large models now fit in GTT. Peak usage 38.8/59 GiB.
|
||||||
|
- **Verdict**: KEEP
|
||||||
|
|
||||||
|
### 1.3 BIOS VRAM Reduction (512 MB)
|
||||||
|
|
||||||
|
- **Date**: 2026-03-26
|
||||||
|
- **Change**: UMA Frame Buffer Size 32 GB -> 512 MB (HP ZBook F10 BIOS)
|
||||||
|
- **Benchmark**: `data/benchmarks/full-opt-all-models-*`
|
||||||
|
- **Result**: 31.5 GB freed for OS/GTT. Small models ~3-8% slower (GTT indirection vs dedicated VRAM), but the system gained the ability to run 37 GB+ models at 32K+ context. Net positive.
|
||||||
|
- **Trade-off**: Small model regression is acceptable given the massive capability gain.
|
||||||
|
- **Verdict**: KEEP
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2: System Tuning
|
||||||
|
|
||||||
|
### 2.1 RyzenAdj PPT Increase
|
||||||
|
|
||||||
|
- **Date**: 2026-03-30
|
||||||
|
- **Change**: `sudo ryzenadj --stapm-limit=85000 --fast-limit=85000 --slow-limit=85000 --apu-slow-limit=85000`
|
||||||
|
- **Result**: STAPM raised from 59W→81W. PPT Fast raised to 81W. **However, PPT SLOW and APU SLOW stuck at 70W** — HP ZBook BIOS EC overrides these limits. Effective sustained power: ~70W (was ~59W).
|
||||||
|
- **Benchmark**: `data/benchmarks/qwen35-shootout-v2-*` (Vulkan, q4_0 KV, pp2048/tg1024)
|
||||||
|
- UD-Q4_K_L: **57.0 t/s** (was ~39 t/s before RyzenAdj = **+46%**)
|
||||||
|
- UD-Q4_K_XL: **56.4 t/s**
|
||||||
|
- Q8_0: **51.4 t/s** (was ~39-41 t/s before = **+25%**)
|
||||||
|
- **Thermals**: 70-73C under load, 30C headroom. Cooling handles it easily.
|
||||||
|
- **Notes**: Settings are volatile (reset on reboot/sleep). Use `sudo make optimize-power` or install systemd service for persistence. HP firmware hard-caps slow PPT at 70W regardless.
|
||||||
|
- **Verdict**: KEEP — significant real-world improvement despite HP firmware limit
|
||||||
|
|
||||||
|
### 2.2 VM Sysctl Tuning
|
||||||
|
|
||||||
|
- **Date**: 2026-03-30
|
||||||
|
- **Change**: `vm.swappiness=1, vm.dirty_ratio=40, vm.dirty_background_ratio=10, vm.max_map_count=500000, vm.zone_reclaim_mode=0`
|
||||||
|
- **Applied via**: `sudo make optimize-power` (persists to `/etc/sysctl.d/99-llm-inference.conf`)
|
||||||
|
- **Notes**: Hard to isolate impact — applied together with other Phase 2 changes. Prevents model weight eviction and I/O disruption.
|
||||||
|
- **Verdict**: KEEP — low risk, persists across reboots
|
||||||
|
|
||||||
|
### 2.3 Transparent Huge Pages
|
||||||
|
|
||||||
|
- **Date**: 2026-03-30
|
||||||
|
- **Change**: `echo always > /sys/kernel/mm/transparent_hugepage/enabled`
|
||||||
|
- **Applied via**: `sudo make optimize-power` (volatile — add `transparent_hugepage=always` to kernel cmdline for persistence)
|
||||||
|
- **Notes**: Reduces TLB misses for mmap'd model files. Hard to isolate impact.
|
||||||
|
- **Verdict**: KEEP — low risk
|
||||||
|
|
||||||
|
### 2.4 RADV_PERFTEST=nogttspill
|
||||||
|
|
||||||
|
- **Date**: 2026-03-30
|
||||||
|
- **Change**: `RADV_PERFTEST=nogttspill` persisted to `/etc/environment.d/radv-llm.conf`
|
||||||
|
- **Applied via**: `sudo make optimize-power`
|
||||||
|
- **Notes**: Prevents GTT spill management overhead on unified memory Vulkan. Takes effect on next login. For current session: `export RADV_PERFTEST=nogttspill`
|
||||||
|
- **Verdict**: KEEP — persists across reboots
|
||||||
|
|
||||||
|
### 2.5 amdgpu.noretry=0
|
||||||
|
|
||||||
|
- **Date**: PENDING
|
||||||
|
- **Change**: Kernel cmdline `amdgpu.noretry=0`
|
||||||
|
- **Expected**: Improved stability under memory pressure
|
||||||
|
- **Notes**: Only apply if experiencing GPU page faults or crashes during large model loading
|
||||||
|
- **Verdict**: PENDING
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3: Runtime Flags
|
||||||
|
|
||||||
|
### 3.1 KV Cache Quantization
|
||||||
|
|
||||||
|
- **Date**: 2026-03-27
|
||||||
|
- **Change**: `--kv-types f16,q8_0,q4_0` sweep
|
||||||
|
- **Benchmark**: `data/benchmarks/kv-sweep-256k-*`
|
||||||
|
- **Result** (Vulkan RADV, Qwen3.5-35B-A3B Q8, pp2048/tg1024):
|
||||||
|
- f16: 456 pp, 39.8 tg
|
||||||
|
- q8_0: 418 pp, 38.5 tg (slight Vulkan regression — unexpected)
|
||||||
|
- **q4_0: 460 pp, 41.1 tg** (fastest overall, +3% tg over f16)
|
||||||
|
- **Result** (ROCm, same model):
|
||||||
|
- f16: 445 pp, 21.5 tg
|
||||||
|
- q8_0: 495 pp, 21.7 tg (+11% pp, same tg)
|
||||||
|
- q4_0: 494 pp, 21.8 tg (+11% pp, same tg)
|
||||||
|
- **Conclusion**: q4_0 is the sweet spot on Vulkan (fastest tg + 75% less KV memory). On ROCm, KV quant helps pp but not tg.
|
||||||
|
- **Verdict**: KEEP — use q4_0 KV as default for serving
|
||||||
|
|
||||||
|
### 3.2 MoE Batch Size `-b 256`
|
||||||
|
|
||||||
|
- **Date**: 2026-03-30
|
||||||
|
- **Change**: `-b 256` vs default (2048)
|
||||||
|
- **Benchmark**: `data/benchmarks/batch-default-*` vs `data/benchmarks/batch-256-*`
|
||||||
|
- **Result** (Vulkan RADV, Qwen3.5-35B-A3B UD-Q4_K_XL, q4_0 KV):
|
||||||
|
- Default: 826 pp, 55.9 tg
|
||||||
|
- b=256: 843 pp, 55.5 tg (within noise)
|
||||||
|
- **Notes**: Community-reported +70% improvement does not reproduce on Vulkan RADV. May only apply to ROCm or CPU backends, or to longer prompts (pp8192+).
|
||||||
|
- **Verdict**: NO IMPACT on Vulkan — not recommended
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4: Build Optimizations
|
||||||
|
|
||||||
|
### 4.1 rocWMMA Flash Attention
|
||||||
|
|
||||||
|
- **Date**: PENDING
|
||||||
|
- **Change**: Rebuild ROCm toolbox with `-DGGML_HIP_ROCWMMA_FATTN=ON -DGGML_HIP_UMA=ON`
|
||||||
|
- **Expected**: +96% long-context performance (65K+)
|
||||||
|
- **Notes**: Need to check if Donato's toolboxes already include this
|
||||||
|
- **Verdict**: PENDING
|
||||||
|
|
||||||
|
### 4.2 rocWMMA Tuned Patch (PR #16827)
|
||||||
|
|
||||||
|
- **Date**: PENDING
|
||||||
|
- **Notes**: Fixes long-context regression. Check Donato's latest toolbox builds.
|
||||||
|
- **Verdict**: PENDING
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5: Future / Blocked
|
||||||
|
|
||||||
|
### 5.1 Speculative Decoding (draft model)
|
||||||
|
|
||||||
|
- **Status**: BLOCKED — llama.cpp PR #20075 (hybrid SSM/MoE checkpoint/restore)
|
||||||
|
- **Draft model**: Downloaded `Qwen3.5-0.8B-Q8_0.gguf` (812 MB) on 2026-03-27
|
||||||
|
- **Last checked**: 2026-03-30 — PR stalled since Mar 5. ROCm buffer crashes in `copy_cell()`. Works on Metal/CUDA but not AMD. Months away from landing.
|
||||||
|
|
||||||
|
### 5.2 Native MTP (Multi-Token Prediction)
|
||||||
|
|
||||||
|
- **Status**: BLOCKED — multiple dependencies unmerged
|
||||||
|
- **Last checked**: 2026-03-30
|
||||||
|
- **Details**: 4 separate PRs in flight, none merged:
|
||||||
|
- PR #18886: MTP API framework (DRAFT since Feb 6) — foundation for all MTP work
|
||||||
|
- PR #20700: MTP for Qwen3.5 **dense only** (WIP, author says "not expected to merge soon")
|
||||||
|
- PR #15225: GLM-style MTP (open since Aug 2025, "slower than baseline")
|
||||||
|
- PR #18039: EAGLE3 speculative (open since Dec 2025)
|
||||||
|
- **Key gap**: No MTP implementation exists for MoE models. PR #20700 only covers dense Qwen3.5 (0.8B-27B), not the 35B-A3B MoE.
|
||||||
|
- **Timeline estimate**: MTP API (#18886) must merge first, then model-specific implementations adapted. Months, not weeks.
|
||||||
|
|
||||||
|
### 5.2a N-gram Speculative Decoding (AVAILABLE NOW)
|
||||||
|
|
||||||
|
- **Status**: WORKS TODAY — no upstream PRs needed
|
||||||
|
- **How**: `llama-server --spec-type ngram-simple --draft-max 64 --draft-min 4`
|
||||||
|
- **Expected**: 1.1-1.4x tg speedup on repetitive content (code, structured output)
|
||||||
|
- **Added to**: `make serve-ngram ARGS="-m MODEL.gguf"` and `bin/serve --ngram`
|
||||||
|
- **Notes**: Pattern-matches from token history, no draft model needed. Best for code generation where patterns repeat. No quality impact.
|
||||||
|
- **Verdict**: AVAILABLE — use `--ngram` flag when serving
|
||||||
|
|
||||||
|
### 5.3 GPU Clock Reporting
|
||||||
|
|
||||||
|
- **Status**: NOT A REAL ISSUE — sysfs reporting is broken, actual clocks are fine
|
||||||
|
- **Measured**: clpeak (2026-03-30) confirms GPU reaches 2900 MHz under compute load
|
||||||
|
- **Notes**: ROCm issue #5750 is about sysfs `pp_dpm_sclk` reporting, not actual performance. No action needed.
|
||||||
|
- **Verdict**: CLOSED — no performance impact
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context Window Benchmarks
|
||||||
|
|
||||||
|
### 64K Context (pp4096/tg1024, MoE models)
|
||||||
|
|
||||||
|
- **Date**: 2026-03-26
|
||||||
|
- **Benchmark**: `data/benchmarks/ctx64k-*`
|
||||||
|
- **Results**: (check logs)
|
||||||
|
|
||||||
|
### 128K Context (pp8192/tg1024, MoE models)
|
||||||
|
|
||||||
|
- **Date**: 2026-03-26
|
||||||
|
- **Benchmark**: `data/benchmarks/ctx128k-realistic-*`
|
||||||
|
- **Results**: (check logs)
|
||||||
|
|
||||||
|
### 256K Context (pp16384/tg1024, MoE models)
|
||||||
|
|
||||||
|
- **Date**: 2026-03-27
|
||||||
|
- **Benchmark**: `data/benchmarks/ctx256k-*`
|
||||||
|
- **Results**: (check logs)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Model Quant Shootout
|
||||||
|
|
||||||
|
### Qwen3.5-35B-A3B — Q4_K_L vs Q4_K_XL vs Q8 (2026-03-30)
|
||||||
|
|
||||||
|
- **Benchmark**: `data/benchmarks/qwen35-shootout-v2-*`
|
||||||
|
- **Config**: Vulkan RADV, q4_0 KV cache, pp2048/tg1024, 2 reps
|
||||||
|
- **RyzenAdj**: STAPM=81W (sustained ~70W due to HP firmware cap)
|
||||||
|
|
||||||
|
| Quant | File Size | pp2048 (t/s) | tg1024 (t/s) | Recommendation |
|
||||||
|
|-------|-----------|-------------|-------------|----------------|
|
||||||
|
| UD-Q4_K_L | 18.8 GB | 825 | **57.0** | Fastest. Good quality. |
|
||||||
|
| **UD-Q4_K_XL** | 20.7 GB | 835 | **56.4** | **Daily driver** — best quality/speed. |
|
||||||
|
| Q8_0 | 34.4 GB | 850 | 51.4 | Best quality, 10% slower tg. |
|
||||||
|
|
||||||
|
**Decision**: Keep UD-Q4_K_XL (daily driver) and Q8_0 (quality fallback). Q4_K_L deleted — Q4_K_XL is strictly better at only +2 GB.
|
||||||
|
|
||||||
|
### Coder Model Shootout (2026-03-30)
|
||||||
|
|
||||||
|
- **Benchmark**: `data/benchmarks/coder-shootout-*`
|
||||||
|
- **Config**: Vulkan RADV, q4_0 KV cache, pp2048/tg1024, 2 reps
|
||||||
|
- **RyzenAdj**: STAPM=81W (sustained ~70W)
|
||||||
|
|
||||||
|
| Model | Architecture | File Size | pp2048 (t/s) | tg1024 (t/s) |
|
||||||
|
|-------|-------------|-----------|-------------|-------------|
|
||||||
|
| **Qwen3-Coder-30B** UD-Q6_K_XL | Pure Transformer | 24.5 GB | 737 | **61.0** |
|
||||||
|
| **Qwen3.5-35B-A3B** UD-Q4_K_XL | Hybrid DeltaNet | 20.7 GB | **821** | 54.9 |
|
||||||
|
| **Nemotron-Cascade-2** Q8_0 | Hybrid Mamba-2 | 31.3 GB | 643 | 52.8 |
|
||||||
|
| **Qwen3-Coder-Next** UD-Q3_K_XL | Hybrid DeltaNet | 33.8 GB | 545 | 46.8 |
|
||||||
|
|
||||||
|
**Analysis**:
|
||||||
|
- tg speed scales inversely with model size (bandwidth-bound at ~215 GB/s)
|
||||||
|
- Pure Transformer (Qwen3-Coder-30B) has lowest overhead per token
|
||||||
|
- DeltaNet hybrid (Qwen3.5) has best pp — DeltaNet layers are efficient for prefill
|
||||||
|
- Qwen3-Coder-Next (80B at 3-bit) is 25% slower tg but has >70% SWE-bench vs ~50% for the 30B
|
||||||
|
|
||||||
|
**Recommended roles**:
|
||||||
|
- **Qwen3-Coder-30B**: Interactive tool-use / function-calling loops (fastest tg, purpose-built)
|
||||||
|
- **Qwen3.5-35B-A3B**: General tasks, long prompt processing (best pp, best all-rounder)
|
||||||
|
- **Qwen3-Coder-Next**: Complex multi-file coding tasks where quality > speed
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How to Add Entries
|
||||||
|
|
||||||
|
When testing a new optimization:
|
||||||
|
|
||||||
|
1. Record the date and exact change
|
||||||
|
2. Run a benchmark: `make benchmark ARGS="--tag DESCRIPTIVE-NAME ..."`
|
||||||
|
3. Compare: `make benchmark-compare BEFORE=data/path/baseline AFTER=data/path/new`
|
||||||
|
4. Update this log with results and verdict
|
||||||
|
5. If KEEP: document in [optimization.md](optimization.md) with the measured numbers
|
||||||
@@ -1,84 +1,275 @@
|
|||||||
# Optimization Guide
|
# Optimization Guide
|
||||||
|
|
||||||
Complete walkthrough for optimizing AMD Strix Halo for LLM workloads.
|
Complete walkthrough for optimizing AMD Strix Halo for LLM inference workloads. Organized in phases from essential to experimental. Each phase builds on the previous.
|
||||||
|
|
||||||
**Prerequisites**: Run `make audit` first to see your current state. Run `make benchmark-baseline` to capture pre-optimization performance numbers.
|
**Prerequisites**: Run `make audit` first to see your current state. Run `make benchmark-baseline` to capture pre-optimization performance numbers.
|
||||||
|
|
||||||
## Step 1: Tuned Profile (no reboot)
|
Track results in [optimization-log.md](optimization-log.md) as you apply each change.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1: Core System (automated scripts)
|
||||||
|
|
||||||
|
These are the foundational optimizations handled by this repo's scripts. Apply in order.
|
||||||
|
|
||||||
|
### 1.1 Tuned Profile (no reboot)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo make optimize-tuned
|
sudo make optimize-tuned
|
||||||
```
|
```
|
||||||
|
|
||||||
Switches from `throughput-performance` to `accelerator-performance`, which disables higher-latency CPU idle (C) states. Provides 5-8% improvement in prompt processing throughput.
|
Switches to `accelerator-performance`: disables higher-latency CPU idle (C) states, sets CPU governor to performance.
|
||||||
|
|
||||||
Takes effect immediately. Previous profile is saved for rollback.
|
**Measured**: +5-8% pp, +2-3% tg.
|
||||||
|
|
||||||
## Step 2: Kernel Boot Parameters (reboot required)
|
### 1.2 Kernel Boot Parameters (reboot required)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo make optimize-kernel
|
sudo make optimize-kernel
|
||||||
```
|
```
|
||||||
|
|
||||||
Adds three parameters to GRUB:
|
|
||||||
|
|
||||||
| Parameter | Value (64 GB) | Purpose |
|
| Parameter | Value (64 GB) | Purpose |
|
||||||
|-----------|--------------|---------|
|
|-----------|--------------|---------|
|
||||||
| `iommu=pt` | — | IOMMU passthrough, reduces memory access latency |
|
| `iommu=pt` | -- | IOMMU passthrough, reduces memory access latency |
|
||||||
| `amdgpu.gttsize` | `60416` | Max GPU-addressable system RAM in MiB |
|
| `amdgpu.gttsize` | `60416` | Max GPU-addressable system RAM in MiB (~59 GiB) |
|
||||||
| `ttm.pages_limit` | `15466496` | Max pinnable 4K pages for GPU memory |
|
| `ttm.pages_limit` | `15466496` | Max pinnable 4K pages for GPU memory |
|
||||||
|
|
||||||
Values are computed dynamically based on your system's total physical RAM. The script backs up `/etc/default/grub` before modifying it.
|
Values computed dynamically from total physical RAM. See [architecture.md](architecture.md) for the math.
|
||||||
|
|
||||||
See [docs/architecture.md](architecture.md) for the math behind these values.
|
### 1.3 BIOS VRAM Reduction (reboot + BIOS access)
|
||||||
|
|
||||||
## Step 3: BIOS VRAM Reduction (reboot + BIOS access)
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make optimize-vram
|
make optimize-vram # Prints guidance — cannot modify BIOS directly
|
||||||
```
|
```
|
||||||
|
|
||||||
This prints guidance — it cannot modify BIOS directly. The goal is to reduce dedicated VRAM from 32 GB to 0.5 GB, freeing 31.5 GB back to the OS for dynamic GPU access via GTT.
|
Reduce UMA Frame Buffer Size from 32 GB to 512 MB. Frees 31.5 GB for GTT. See [bios-vram-guide.md](bios-vram-guide.md) (HP ZBook: F10 at boot).
|
||||||
|
|
||||||
See [docs/bios-vram-guide.md](bios-vram-guide.md) for the full BIOS walkthrough.
|
**Combine 1.2 and 1.3 into a single reboot.**
|
||||||
|
|
||||||
**Combine Steps 2 and 3 into a single reboot**: apply kernel params, then reboot into BIOS (F10) to change VRAM, then boot normally.
|
### 1.4 Verify
|
||||||
|
|
||||||
## Step 4: Verify
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make verify
|
make verify # 9-point checklist, target: 9/9
|
||||||
|
make audit # Single-screen system status with scores
|
||||||
```
|
```
|
||||||
|
|
||||||
Checks 9 criteria and reports a score. Target: 9/9.
|
### Phase 1 Measured Impact
|
||||||
|
|
||||||
## Step 5: Measure Impact
|
| Optimization | pp | tg |
|
||||||
|
|-------------|----|----|
|
||||||
```bash
|
|
||||||
make benchmark
|
|
||||||
make benchmark-compare BEFORE=data/baselines/TIMESTAMP AFTER=data/benchmarks/TAG-TIMESTAMP
|
|
||||||
```
|
|
||||||
|
|
||||||
See [docs/benchmarking.md](benchmarking.md) for methodology and result interpretation.
|
|
||||||
|
|
||||||
## Expected Impact
|
|
||||||
|
|
||||||
| Optimization | pp512 Improvement | tg128 Improvement |
|
|
||||||
|-------------|-------------------|-------------------|
|
|
||||||
| Tuned profile | +5-8% | +2-3% |
|
| Tuned profile | +5-8% | +2-3% |
|
||||||
| Kernel params + BIOS VRAM | +10-20% | +5-15% |
|
| Kernel params + BIOS VRAM | Enables 37 GB+ models | +5-15% |
|
||||||
| **Combined** | **+15-25%** | **+8-18%** |
|
| **Phase 1 combined** | **+15-25%** | **+8-18%** |
|
||||||
|
|
||||||
Numbers vary by model size and backend. Larger models see bigger gains from GTT expansion.
|
Trade-off: Small models (<5 GB) are ~3-8% slower due to GTT indirection vs dedicated VRAM. Acceptable given the massive capability gain.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2: System Tuning
|
||||||
|
|
||||||
|
All Phase 2 optimizations are applied with a single command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo make optimize-power
|
||||||
|
```
|
||||||
|
|
||||||
|
This applies RyzenAdj, sysctl, THP, and RADV nogttspill. Sysctl and nogttspill persist across reboots. RyzenAdj and THP are volatile.
|
||||||
|
|
||||||
|
For RyzenAdj persistence across reboots:
|
||||||
|
```bash
|
||||||
|
sudo cp configs/ryzenadj-llm.service configs/ryzenadj-resume.service /etc/systemd/system/
|
||||||
|
sudo systemctl enable --now ryzenadj-llm.service
|
||||||
|
sudo systemctl enable ryzenadj-resume.service
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.1 RyzenAdj PPT Increase
|
||||||
|
|
||||||
|
The HP ZBook Ultra G1a ships at 59W sustained. RyzenAdj raises this.
|
||||||
|
|
||||||
|
**Measured on HP ZBook**: STAPM raised to 81W, but **HP firmware hard-caps PPT SLOW at 70W**. Effective sustained power: ~70W (was ~59W). Cannot be overridden without modded BIOS.
|
||||||
|
|
||||||
|
**Measured impact**: Qwen3.5-35B-A3B tg1024 went from **~39 t/s → 57 t/s (+46%)**. This was the single largest improvement in the entire optimization journey.
|
||||||
|
|
||||||
|
Thermals: 70-73C under sustained load. 30C headroom. Cooling handles it easily.
|
||||||
|
|
||||||
|
### 2.2 VM / Sysctl Tuning
|
||||||
|
|
||||||
|
Persisted to `/etc/sysctl.d/99-llm-inference.conf`:
|
||||||
|
|
||||||
|
| Parameter | Default | Set To | Why |
|
||||||
|
|-----------|---------|--------|-----|
|
||||||
|
| `vm.swappiness` | 60 | **1** | Prevent model weight eviction |
|
||||||
|
| `vm.dirty_ratio` | 20 | **40** | Reduce I/O flush storms during inference |
|
||||||
|
| `vm.max_map_count` | 65530 | **500000** | Large models need many memory mappings |
|
||||||
|
| `vm.zone_reclaim_mode` | 0 | **0** | Don't aggressively reclaim memory zones |
|
||||||
|
|
||||||
|
### 2.3 Transparent Huge Pages
|
||||||
|
|
||||||
|
Set to `always`. Reduces TLB misses for mmap'd model files. Volatile — add `transparent_hugepage=always` to kernel cmdline for persistence.
|
||||||
|
|
||||||
|
### 2.4 RADV_PERFTEST=nogttspill
|
||||||
|
|
||||||
|
Persisted to `/etc/environment.d/radv-llm.conf`. Prevents GTT spill management overhead on unified memory. Vulkan RADV only.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3: Runtime Flags (per-invocation)
|
||||||
|
|
||||||
|
These flags should be used when running llama-bench, llama-server, or llama-cli. Already set in this repo's benchmark scripts.
|
||||||
|
|
||||||
|
### 3.1 `-mmp 0` (no mmap) — mandatory
|
||||||
|
|
||||||
|
On unified memory, mmap adds a double-copy penalty. Always disable.
|
||||||
|
|
||||||
|
### 3.2 KV Cache Quantization — use Q4_0
|
||||||
|
|
||||||
|
**Measured** (Vulkan RADV, Qwen3.5-35B-A3B):
|
||||||
|
|
||||||
|
| KV Type | pp2048 | tg1024 | Memory Savings |
|
||||||
|
|---------|--------|--------|---------------|
|
||||||
|
| f16 | 456 | 39.8 | Baseline |
|
||||||
|
| q8_0 | 418 | 38.5 | ~50% (slightly slower on Vulkan) |
|
||||||
|
| **q4_0** | **460** | **41.1** | **75% (fastest on Vulkan)** |
|
||||||
|
|
||||||
|
Q4_0 is faster because less memory bandwidth is spent reading KV cache. Use as default for serving. Quality impact is noticeable only on reasoning-heavy tasks.
|
||||||
|
|
||||||
|
### 3.3 Flash Attention (`-fa 1`) — always enable
|
||||||
|
|
||||||
|
+24% pp on ROCm. Modest improvement on Vulkan (CoopMat1). Already in benchmark scripts.
|
||||||
|
|
||||||
|
### 3.4 `ROCBLAS_USE_HIPBLASLT=1` (ROCm only) — mandatory
|
||||||
|
|
||||||
|
Without this, ROCm pp is 2-7x slower on gfx1151. Already in benchmark scripts.
|
||||||
|
|
||||||
|
### 3.5 Backend Selection
|
||||||
|
|
||||||
|
**Measured** (Qwen3.5-35B-A3B Q8, 128K context):
|
||||||
|
|
||||||
|
| Workload | Vulkan RADV | ROCm 7.2 | Winner |
|
||||||
|
|----------|------------|----------|--------|
|
||||||
|
| pp2048 | 456 | 445 | Vulkan (+2%) |
|
||||||
|
| tg1024 | 39.8 | 21.5 | **Vulkan (1.9x)** |
|
||||||
|
| pp8192 @ 131K | 130 | 84 | **Vulkan (1.5x)** |
|
||||||
|
| tg32 @ 131K | 22.0 | 8.1 | **Vulkan (2.7x)** |
|
||||||
|
|
||||||
|
**Vulkan RADV dominates across all workloads on this hardware.** ROCm is significantly slower for token generation, especially at long context. Never use AMDVLK.
|
||||||
|
|
||||||
|
### 3.6 MoE Batch Size `-b 256` — NOT YET TESTED
|
||||||
|
|
||||||
|
Community reports up to +70% pp improvement for MoE models. Default batch (2048) may be too large. Needs benchmarking.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4: Build Optimizations (not yet tested)
|
||||||
|
|
||||||
|
These require rebuilding the llama.cpp toolbox containers. Given that **Vulkan RADV already outperforms ROCm significantly**, the ROI of these ROCm-specific optimizations is unclear.
|
||||||
|
|
||||||
|
### 4.1 ROCm Build Flags
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_HIP=ON \
|
||||||
|
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||||
|
-DGGML_HIP_UMA=ON \
|
||||||
|
-DAMDGPU_TARGETS=gfx1151
|
||||||
|
```
|
||||||
|
|
||||||
|
`GGML_HIP_ROCWMMA_FATTN` enables GPU-accelerated flash attention via WMMA. Could close the gap between ROCm and Vulkan for long-context workloads. Check if Donato Capitella's ROCm toolboxes already include this.
|
||||||
|
|
||||||
|
### 4.2 Vulkan Cooperative Matrices
|
||||||
|
|
||||||
|
RADV supports `VK_KHR_cooperative_matrix` for RDNA 3+. Already used by llama.cpp for matrix operations. The current Vulkan toolbox shows `matrix cores: KHR_coopmat` — this is likely already active.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5: Future / Currently Blocked
|
||||||
|
|
||||||
|
### 5.1 Speculative Decoding
|
||||||
|
|
||||||
|
Expected 1.8-2.5x tg speedup. Draft model (`Qwen3.5-0.8B-Q8_0.gguf`) downloaded.
|
||||||
|
|
||||||
|
**Blocked**: llama.cpp PR #20075 — hybrid SSM/MoE speculative rollback.
|
||||||
|
|
||||||
|
### 5.2 Native Multi-Token Prediction
|
||||||
|
|
||||||
|
Qwen3.5 has built-in MTP heads. No draft model needed.
|
||||||
|
|
||||||
|
**Blocked**: llama.cpp PR #20700 — MTP for Qwen3.5.
|
||||||
|
|
||||||
|
### 5.3 GPU Clock Reporting Fix
|
||||||
|
|
||||||
|
ROCm issue #5750 reports GPU clocks appearing stuck at ~885 MHz in sysfs. However, **clpeak confirms clocks reach 2900 MHz under compute load** (measured 2026-03-30). The issue is likely broken sysfs reporting, not actual clock throttling. No performance impact.
|
||||||
|
|
||||||
|
**Tracking**: ROCm issue #5750 (sysfs reporting only, not a real blocker).
|
||||||
|
|
||||||
|
### 5.4 Other Future Items
|
||||||
|
|
||||||
|
- **SageAttention**: 2-5x over FlashAttention. No AMD port yet.
|
||||||
|
- **XDNA NPU** (50 TOPS): Linux support coming in kernel 7.1. Could run draft model for speculative decoding.
|
||||||
|
- **TurboQuant 3-bit KV** (ICLR 2026): 4.9x KV compression. Being integrated into llama.cpp.
|
||||||
|
- **LLMLingua-2**: 20x prompt compression for agentic/RAG workloads.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hardware Limits (measured via clpeak on this system, 2026-03-30)
|
||||||
|
|
||||||
|
| Resource | Value | Notes |
|
||||||
|
|----------|-------|-------|
|
||||||
|
| DRAM bandwidth | **216-233 GB/s** | clpeak float: 215.7, float16: 232.6. Theoretical max: 256 GB/s. |
|
||||||
|
| Infinity Cache | **32 MB, ~1 TB/s** | MoE effective throughput exceeds DRAM BW due to cache hits |
|
||||||
|
| GPU clocks | **2900 MHz confirmed** | clpeak shows clocks reach 2900 MHz under load. ROCm #5750 sysfs reporting may be broken but actual clocks are correct. |
|
||||||
|
| FP16 compute | **21.9 TFLOPS** | 40 CUs (20 WGPs) x 2900 MHz |
|
||||||
|
| FP32 compute | **12.3 TFLOPS** | |
|
||||||
|
| LPDDR5X-8000 | 8000 MT/s, 256-bit | Soldered, no overclocking |
|
||||||
|
| Infinity Fabric | 2 GHz FCLK | Fixed |
|
||||||
|
| HP ZBook sustained power | **70W** (firmware cap) | RyzenAdj can set 85W but HP overrides to 70W |
|
||||||
|
|
||||||
|
**Note on MoE bandwidth**: MoE models (3B active params) read ~13.5 GB per token at Q4. At 55 t/s this implies ~740 GB/s effective throughput — well above the 216 GB/s DRAM ceiling. The 32 MB Infinity Cache (~1 TB/s) is actively boosting throughput for repeated KV cache and activation reads.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Summary (all measured, Vulkan RADV, q4_0 KV)
|
||||||
|
|
||||||
|
### Model Shootout (pp2048/tg1024, Phase 1+2 applied)
|
||||||
|
|
||||||
|
| Model | Arch | Size | pp2048 | tg1024 |
|
||||||
|
|-------|------|------|--------|--------|
|
||||||
|
| **Qwen3-Coder-30B** UD-Q6_K_XL | Pure Transformer | 24.5 GB | 737 | **61.0** |
|
||||||
|
| **Qwen3.5-35B-A3B** UD-Q4_K_XL | Hybrid DeltaNet | 20.7 GB | **821** | 54.9 |
|
||||||
|
| **Nemotron-Cascade-2** Q8_0 | Hybrid Mamba-2 | 31.3 GB | 643 | 52.8 |
|
||||||
|
| **Qwen3-Coder-Next** UD-Q3_K_XL | Hybrid DeltaNet | 33.8 GB | 545 | 46.8 |
|
||||||
|
|
||||||
|
tg speed is bandwidth-bound: smaller model = faster tokens.
|
||||||
|
|
||||||
|
### Optimization Journey (Qwen3.5-35B-A3B, tg on Vulkan)
|
||||||
|
|
||||||
|
| Stage | tg (t/s) | Improvement |
|
||||||
|
|-------|----------|-------------|
|
||||||
|
| Pre-optimization (stock) | ~33 | Baseline |
|
||||||
|
| After Phase 1 (tuned + kernel + BIOS) | ~39 | +18% |
|
||||||
|
| After Phase 2 (RyzenAdj + sysctl + THP) | **~57** | **+46%** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Rollback
|
## Rollback
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo make rollback
|
sudo make rollback # Restores GRUB backup and previous tuned profile
|
||||||
```
|
```
|
||||||
|
|
||||||
Restores GRUB backup and previous tuned profile. BIOS VRAM must be reverted manually (F10 → restore previous UMA Frame Buffer Size).
|
Phase 2 rollback:
|
||||||
|
- RyzenAdj: `sudo ryzenadj --stapm-limit=60000 --fast-limit=60000 --slow-limit=60000 --apu-slow-limit=60000`
|
||||||
|
- Sysctl: `sudo rm /etc/sysctl.d/99-llm-inference.conf && sudo sysctl --system`
|
||||||
|
- THP: `echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled`
|
||||||
|
- nogttspill: `sudo rm /etc/environment.d/radv-llm.conf`
|
||||||
|
- BIOS VRAM: revert manually (F10 at boot)
|
||||||
|
|
||||||
## Troubleshooting
|
---
|
||||||
|
|
||||||
If anything goes wrong, see [docs/troubleshooting.md](troubleshooting.md).
|
## Further Reading
|
||||||
|
|
||||||
|
- [Optimization log](optimization-log.md) — Detailed test results and verdicts
|
||||||
|
- [Hardware analysis](llama-cpp-optimization-research.md) — llama.cpp flags, backends, quantization deep dive
|
||||||
|
- [Inference landscape](inference-optimization-landscape.md) — Broader survey of engines and techniques
|
||||||
|
- [Benchmarking guide](benchmarking.md) — Methodology and result interpretation
|
||||||
|
- [References](references.md) — All external links
|
||||||
|
|||||||
@@ -21,8 +21,13 @@ The most comprehensive community resource for Strix Halo LLM optimization.
|
|||||||
## Community
|
## Community
|
||||||
|
|
||||||
- [Strix Halo Wiki — AI Capabilities](https://strixhalo.wiki/AI/AI_Capabilities_Overview) — Community benchmarks, model compatibility
|
- [Strix Halo Wiki — AI Capabilities](https://strixhalo.wiki/AI/AI_Capabilities_Overview) — Community benchmarks, model compatibility
|
||||||
|
- [Strix Halo Wiki — Power Modes](https://strixhalo.wiki/Guides/Power-Modes-and-Performance) — RyzenAdj sweet spots (85W recommended)
|
||||||
|
- [Strix Halo Wiki — llama.cpp Performance](https://strixhalo.wiki/AI/llamacpp-performance) — Backend comparison data
|
||||||
- [Level1Techs Forum — HP G1a Guide](https://forum.level1techs.com/t/the-ultimate-arch-secureboot-guide-for-ryzen-ai-max-ft-hp-g1a-128gb-8060s-monster-laptop/230652) — Laptop-specific configuration
|
- [Level1Techs Forum — HP G1a Guide](https://forum.level1techs.com/t/the-ultimate-arch-secureboot-guide-for-ryzen-ai-max-ft-hp-g1a-128gb-8060s-monster-laptop/230652) — Laptop-specific configuration
|
||||||
- [Framework Community — GPU Performance Tests](https://community.frame.work/t/amd-strix-halo-ryzen-ai-max-395-gpu-llm-performance-tests/72521) — Framework Desktop results
|
- [Framework Community — GPU Performance Tests](https://community.frame.work/t/amd-strix-halo-ryzen-ai-max-395-gpu-llm-performance-tests/72521) — Framework Desktop results
|
||||||
|
- [Framework Community — Compiling vLLM on Strix Halo](https://community.frame.work/t/how-to-compiling-vllm-from-source-on-strix-halo/77241) — Native vLLM build guide
|
||||||
|
- [Hardware Corner — Strix Halo LLM Optimization](https://www.hardware-corner.net/strix-halo-llm-optimization/) — Comprehensive optimization walkthrough
|
||||||
|
- [Chips and Cheese — Strix Halo Memory Subsystem](https://chipsandcheese.com/p/strix-halos-memory-subsystem-tackling) — Bandwidth measurements (215 GB/s)
|
||||||
- [LLM Tracker — Strix Halo](https://llm-tracker.info/_TOORG/Strix-Halo) — Centralized performance database
|
- [LLM Tracker — Strix Halo](https://llm-tracker.info/_TOORG/Strix-Halo) — Centralized performance database
|
||||||
|
|
||||||
## Other Strix Halo Repos
|
## Other Strix Halo Repos
|
||||||
@@ -61,6 +66,22 @@ The most comprehensive community resource for Strix Halo LLM optimization.
|
|||||||
- [SWE-bench](https://github.com/princeton-nlp/SWE-bench) — Real GitHub issue resolution
|
- [SWE-bench](https://github.com/princeton-nlp/SWE-bench) — Real GitHub issue resolution
|
||||||
- [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) — Optimized agentic framework for Qwen models
|
- [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) — Optimized agentic framework for Qwen models
|
||||||
|
|
||||||
|
## System Tuning
|
||||||
|
|
||||||
|
- [RyzenAdj](https://github.com/FlyGoat/RyzenAdj) — Power management for Ryzen APUs (PPT/TDP control)
|
||||||
|
- [geohot/ztop](https://github.com/geohot/ztop) — Power monitoring for Strix Halo (discovered 60W HP limits)
|
||||||
|
- [ROCm Issue #5750](https://github.com/ROCm/ROCm/issues/5750) — GPU clocks stuck at idle on gfx1151
|
||||||
|
- [Mesa RADV Environment Variables](https://docs.mesa3d.org/envvars.html) — RADV_PERFTEST=nogttspill docs
|
||||||
|
- [Linux Kernel: amd-pstate](https://docs.kernel.org/admin-guide/pm/amd-pstate.html) — CPU power management
|
||||||
|
|
||||||
|
## llama.cpp Optimization
|
||||||
|
|
||||||
|
- [llama.cpp Speculative Decoding](https://github.com/ggml-org/llama.cpp/blob/master/docs/speculative.md) — Draft model setup
|
||||||
|
- [llama.cpp PR #20075](https://github.com/ggml-org/llama.cpp/pull/20075) — Fix speculative for Qwen3.5 MoE (pending)
|
||||||
|
- [llama.cpp PR #20700](https://github.com/ggml-org/llama.cpp/pull/20700) — Native MTP for Qwen3.5 (WIP)
|
||||||
|
- [llama.cpp PR #16827](https://github.com/ggml-org/llama.cpp/pull/16827) — rocWMMA tuned flash attention
|
||||||
|
- [llama.cpp Issue #12444](https://github.com/ggml-org/llama.cpp/issues/12444) — Hugepage support proposal
|
||||||
|
|
||||||
## AMD GPU Profiling
|
## AMD GPU Profiling
|
||||||
|
|
||||||
- [Radeon GPU Profiler (RGP)](https://gpuopen.com/rgp/) — Hardware-level Vulkan/HIP profiling
|
- [Radeon GPU Profiler (RGP)](https://gpuopen.com/rgp/) — Hardware-level Vulkan/HIP profiling
|
||||||
|
|||||||
@@ -21,6 +21,8 @@ CTX_DEPTH=32768
|
|||||||
CTX_PROMPT=2048
|
CTX_PROMPT=2048
|
||||||
PP_TOKENS=512
|
PP_TOKENS=512
|
||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
|
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
||||||
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -31,6 +33,8 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
--tg) TG_TOKENS="$2"; shift 2 ;;
|
--tg) TG_TOKENS="$2"; shift 2 ;;
|
||||||
|
-b|--batch) BATCH_SIZE="$2"; shift 2 ;;
|
||||||
|
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: run-baseline.sh [OPTIONS]"
|
echo "Usage: run-baseline.sh [OPTIONS]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -42,11 +46,16 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
echo " --tg N Token generation count (default: 128)"
|
echo " --tg N Token generation count (default: 128)"
|
||||||
|
echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)"
|
||||||
|
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
||||||
|
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
||||||
|
echo " Types: f16, q8_0, q4_0, q4_1"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
|
echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
|
||||||
echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE"
|
echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE"
|
||||||
echo " run-baseline.sh --tg 1024 --pp 2048 --category moe # Realistic agentic"
|
echo " run-baseline.sh --tg 1024 --pp 2048 --category moe # Realistic agentic"
|
||||||
|
echo " run-baseline.sh --kv-types f16,q8_0,q4_0 --context 131072 # KV sweep"
|
||||||
echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
|
echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
|
||||||
exit 0 ;;
|
exit 0 ;;
|
||||||
*) log_warn "Unknown argument: $1"; shift ;;
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
@@ -59,11 +68,19 @@ if (( CTX_DEPTH > 32768 )); then
|
|||||||
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Parse KV cache types for sweep
|
||||||
|
if [[ -n "$KV_TYPES_RAW" ]]; then
|
||||||
|
IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
|
||||||
|
else
|
||||||
|
KV_TYPES=("f16")
|
||||||
|
fi
|
||||||
|
|
||||||
log_header "Baseline Benchmark Capture"
|
log_header "Baseline Benchmark Capture"
|
||||||
log_info "Results will be saved to: $RESULT_DIR"
|
log_info "Results will be saved to: $RESULT_DIR"
|
||||||
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
|
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
|
||||||
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
|
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
|
||||||
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
|
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
|
||||||
|
(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"
|
||||||
|
|
||||||
# ── 1. Save system state ────────────────────────────────
|
# ── 1. Save system state ────────────────────────────────
|
||||||
log_info "Capturing system state..."
|
log_info "Capturing system state..."
|
||||||
@@ -165,9 +182,8 @@ log_info "Metric logger started (PID: $METRICS_PID)"
|
|||||||
cleanup() {
|
cleanup() {
|
||||||
kill "$METRICS_PID" 2>/dev/null || true
|
kill "$METRICS_PID" 2>/dev/null || true
|
||||||
wait "$METRICS_PID" 2>/dev/null || true
|
wait "$METRICS_PID" 2>/dev/null || true
|
||||||
return 0
|
|
||||||
}
|
}
|
||||||
trap cleanup EXIT
|
trap 'cleanup; exit 0' EXIT
|
||||||
|
|
||||||
# ── 5. Run benchmarks ───────────────────────────────────
|
# ── 5. Run benchmarks ───────────────────────────────────
|
||||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||||
@@ -189,56 +205,85 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Standard test
|
for KV_SPEC in "${KV_TYPES[@]}"; do
|
||||||
local_suffix="fa1"
|
# Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
|
||||||
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
if [[ "$KV_SPEC" == *:* ]]; then
|
||||||
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
|
KV_K="${KV_SPEC%%:*}"
|
||||||
if [[ ! -s "$OUT" ]]; then
|
KV_V="${KV_SPEC##*:}"
|
||||||
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
|
|
||||||
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
|
||||||
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
|
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD[*]}"
|
|
||||||
if "${CMD[@]}" > "$OUT" 2>&1; then
|
|
||||||
log_success "Standard test complete"
|
|
||||||
tail -5 "$OUT"
|
|
||||||
else
|
else
|
||||||
log_error "Standard test failed (exit $?)"
|
KV_K="$KV_SPEC"
|
||||||
echo "FAILED" >> "$OUT"
|
KV_V="$KV_SPEC"
|
||||||
fi
|
fi
|
||||||
else
|
|
||||||
log_info "Skipping standard test (log exists): $OUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Long-context test (pp2048, tg32, ctx 32768)
|
# Build KV cache args (skip for f16 — it's the default)
|
||||||
if $SKIP_LONGCTX; then
|
KV_ARGS=()
|
||||||
continue
|
KV_SUFFIX=""
|
||||||
fi
|
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
|
||||||
|
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
|
||||||
|
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
||||||
|
fi
|
||||||
|
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
|
# Build batch size args
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
BATCH_ARGS=()
|
||||||
printf "\n${BOLD}>> [%s] %s — long-context %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
|
BATCH_SUFFIX=""
|
||||||
|
if [[ -n "$BATCH_SIZE" ]]; then
|
||||||
|
BATCH_ARGS+=(-b "$BATCH_SIZE")
|
||||||
|
BATCH_SUFFIX="__b${BATCH_SIZE}"
|
||||||
|
fi
|
||||||
|
|
||||||
UB_SIZE=2048
|
# Standard test
|
||||||
[[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
local_suffix="fa1"
|
||||||
|
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
||||||
|
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
|
if [[ ! -s "$OUT" ]]; then
|
||||||
|
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
||||||
|
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
|
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
printf " cmd: %s\n" "${CMD[*]}"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
if "${CMD[@]}" > "$OUT" 2>&1; then
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
log_success "Standard test complete"
|
||||||
-r "$REPS_LONGCTX")
|
tail -5 "$OUT"
|
||||||
|
else
|
||||||
printf " cmd: %s\n" "${CMD_LC[*]}"
|
log_error "Standard test failed (exit $?)"
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
echo "FAILED" >> "$OUT"
|
||||||
log_success "Long-context test complete"
|
fi
|
||||||
tail -5 "$OUT_LC"
|
|
||||||
else
|
else
|
||||||
log_error "Long-context test failed (exit $?)"
|
log_info "Skipping standard test (log exists): $OUT"
|
||||||
echo "FAILED" >> "$OUT_LC"
|
|
||||||
fi
|
fi
|
||||||
else
|
|
||||||
log_info "Skipping long-context test (log exists): $OUT_LC"
|
# Long-context test
|
||||||
fi
|
if $SKIP_LONGCTX; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
|
printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
||||||
|
|
||||||
|
UB_SIZE=2048
|
||||||
|
[[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
|
|
||||||
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
|
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
||||||
|
-r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
|
printf " cmd: %s\n" "${CMD_LC[*]}"
|
||||||
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
|
log_success "Long-context test complete"
|
||||||
|
tail -5 "$OUT_LC"
|
||||||
|
else
|
||||||
|
log_error "Long-context test failed (exit $?)"
|
||||||
|
echo "FAILED" >> "$OUT_LC"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log_info "Skipping long-context test (log exists): $OUT_LC"
|
||||||
|
fi
|
||||||
|
done # KV_TYPES
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -258,6 +303,10 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
if "FAILED" in content:
|
if "FAILED" in content:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Extract KV cache type from filename (__kv_q8_0-q8_0)
|
||||||
|
kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
|
||||||
|
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
|
||||||
|
|
||||||
for line in content.splitlines():
|
for line in content.splitlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
||||||
@@ -266,12 +315,15 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
parts = [p.strip() for p in line.split("|")]
|
parts = [p.strip() for p in line.split("|")]
|
||||||
if len(parts) < 10:
|
# Filter out empty parts from leading/trailing pipes
|
||||||
|
data = [p for p in parts if p and "---" not in p]
|
||||||
|
if len(data) < 6:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
test_type = parts[8].strip() if len(parts) > 8 else ""
|
# test and t/s are always the last two columns
|
||||||
ts_raw = parts[9].strip() if len(parts) > 9 else ""
|
test_type = data[-2]
|
||||||
|
ts_raw = data[-1]
|
||||||
if not test_type or not ts_raw:
|
if not test_type or not ts_raw:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -281,11 +333,12 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
"file": logfile.name,
|
"file": logfile.name,
|
||||||
"model": parts[1].strip(),
|
"model": data[0],
|
||||||
"size": parts[2].strip(),
|
"size": data[1],
|
||||||
"backend": parts[4].strip(),
|
"backend": data[3],
|
||||||
"test": test_type,
|
"test": test_type,
|
||||||
"tokens_per_sec": float(ts_match.group(1)),
|
"tokens_per_sec": float(ts_match.group(1)),
|
||||||
|
"kv_cache": kv_type,
|
||||||
"raw": ts_raw,
|
"raw": ts_raw,
|
||||||
})
|
})
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
@@ -307,13 +360,14 @@ if not data["results"]:
|
|||||||
print(" No results parsed. Check log files for errors.")
|
print(" No results parsed. Check log files for errors.")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
fmt = " {:<20} {:<16} {:<8} {:>10}"
|
fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
|
||||||
print(fmt.format("Model", "Backend", "Test", "t/s"))
|
print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
|
||||||
print(" " + "-" * 58)
|
print(" " + "-" * 68)
|
||||||
for r in data["results"]:
|
for r in data["results"]:
|
||||||
print(fmt.format(
|
print(fmt.format(
|
||||||
r["model"][:20],
|
r["model"][:20],
|
||||||
r["backend"][:16],
|
r["backend"][:16],
|
||||||
|
r.get("kv_cache", "f16/f16")[:10],
|
||||||
r["test"],
|
r["test"],
|
||||||
f"{r['tokens_per_sec']:.2f}"
|
f"{r['tokens_per_sec']:.2f}"
|
||||||
))
|
))
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ CTX_DEPTH=32768
|
|||||||
CTX_PROMPT=2048
|
CTX_PROMPT=2048
|
||||||
PP_TOKENS=512
|
PP_TOKENS=512
|
||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
|
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
||||||
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -33,6 +35,8 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
--tg) TG_TOKENS="$2"; shift 2 ;;
|
--tg) TG_TOKENS="$2"; shift 2 ;;
|
||||||
|
-b|--batch) BATCH_SIZE="$2"; shift 2 ;;
|
||||||
|
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: run-suite.sh [OPTIONS]"
|
echo "Usage: run-suite.sh [OPTIONS]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -47,10 +51,16 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
echo " --tg N Token generation count (default: 128)"
|
echo " --tg N Token generation count (default: 128)"
|
||||||
|
echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)"
|
||||||
|
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
||||||
|
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
||||||
|
echo " Types: f16, q8_0, q4_0, q4_1"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
|
echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
|
||||||
echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
|
echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
|
||||||
|
echo " run-suite.sh --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
|
||||||
|
echo " run-suite.sh --tag kv-mixed --kv-types q8_0,q4_0:q8_0 --context 131072"
|
||||||
echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
|
echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
|
||||||
exit 0 ;;
|
exit 0 ;;
|
||||||
*) log_warn "Unknown argument: $1"; shift ;;
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
@@ -63,12 +73,20 @@ if (( CTX_DEPTH > 32768 )); then
|
|||||||
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Parse KV cache types for sweep
|
||||||
|
if [[ -n "$KV_TYPES_RAW" ]]; then
|
||||||
|
IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
|
||||||
|
else
|
||||||
|
KV_TYPES=("f16")
|
||||||
|
fi
|
||||||
|
|
||||||
TS="$(timestamp)"
|
TS="$(timestamp)"
|
||||||
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
|
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
|
||||||
mkdir -p "$RESULT_DIR"
|
mkdir -p "$RESULT_DIR"
|
||||||
|
|
||||||
log_header "Benchmark Suite: $TAG"
|
log_header "Benchmark Suite: $TAG"
|
||||||
log_info "Results: $RESULT_DIR"
|
log_info "Results: $RESULT_DIR"
|
||||||
|
(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"
|
||||||
|
|
||||||
# Save system state
|
# Save system state
|
||||||
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
|
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
|
||||||
@@ -157,7 +175,11 @@ log_info "Models: ${#MODEL_PATHS[@]}"
|
|||||||
METRICS_FILE="$RESULT_DIR/metrics.csv"
|
METRICS_FILE="$RESULT_DIR/metrics.csv"
|
||||||
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
|
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
|
||||||
METRICS_PID=$!
|
METRICS_PID=$!
|
||||||
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null; true' EXIT
|
cleanup() {
|
||||||
|
kill "$METRICS_PID" 2>/dev/null || true
|
||||||
|
wait "$METRICS_PID" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
trap 'cleanup; exit 0' EXIT
|
||||||
|
|
||||||
# Run benchmarks (same logic as run-baseline.sh)
|
# Run benchmarks (same logic as run-baseline.sh)
|
||||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||||
@@ -176,39 +198,68 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Standard test
|
for KV_SPEC in "${KV_TYPES[@]}"; do
|
||||||
local_suffix="fa1"
|
# Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
|
||||||
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
if [[ "$KV_SPEC" == *:* ]]; then
|
||||||
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
|
KV_K="${KV_SPEC%%:*}"
|
||||||
if [[ ! -s "$OUT" ]]; then
|
KV_V="${KV_SPEC##*:}"
|
||||||
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
|
|
||||||
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
|
||||||
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
|
|
||||||
if "${CMD[@]}" > "$OUT" 2>&1; then
|
|
||||||
log_success "Done"; tail -3 "$OUT"
|
|
||||||
else
|
else
|
||||||
log_error "Failed"; echo "FAILED" >> "$OUT"
|
KV_K="$KV_SPEC"
|
||||||
|
KV_V="$KV_SPEC"
|
||||||
fi
|
fi
|
||||||
fi
|
|
||||||
|
|
||||||
# Long-context test
|
# Build KV cache args (skip for f16 — it's the default)
|
||||||
if $SKIP_LONGCTX; then
|
KV_ARGS=()
|
||||||
continue
|
KV_SUFFIX=""
|
||||||
fi
|
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
|
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
||||||
printf "\n${BOLD}>> [%s] %s — longctx %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
|
|
||||||
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX")
|
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
|
||||||
log_success "Done"; tail -3 "$OUT_LC"
|
|
||||||
else
|
|
||||||
log_error "Failed"; echo "FAILED" >> "$OUT_LC"
|
|
||||||
fi
|
fi
|
||||||
fi
|
|
||||||
|
# Build batch size args
|
||||||
|
BATCH_ARGS=()
|
||||||
|
BATCH_SUFFIX=""
|
||||||
|
if [[ -n "$BATCH_SIZE" ]]; then
|
||||||
|
BATCH_ARGS+=(-b "$BATCH_SIZE")
|
||||||
|
BATCH_SUFFIX="__b${BATCH_SIZE}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Standard test
|
||||||
|
local_suffix="fa1"
|
||||||
|
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
||||||
|
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
|
if [[ ! -s "$OUT" ]]; then
|
||||||
|
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
||||||
|
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
|
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
|
if "${CMD[@]}" > "$OUT" 2>&1; then
|
||||||
|
log_success "Done"; tail -3 "$OUT"
|
||||||
|
else
|
||||||
|
log_error "Failed"; echo "FAILED" >> "$OUT"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Long-context test
|
||||||
|
if $SKIP_LONGCTX; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
|
printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
||||||
|
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
|
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
||||||
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
|
log_success "Done"; tail -3 "$OUT_LC"
|
||||||
|
else
|
||||||
|
log_error "Failed"; echo "FAILED" >> "$OUT_LC"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done # KV_TYPES
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -226,6 +277,11 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
content = logfile.read_text()
|
content = logfile.read_text()
|
||||||
if "FAILED" in content:
|
if "FAILED" in content:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Extract KV cache type from filename (__kv_q8_0-q8_0)
|
||||||
|
kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
|
||||||
|
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
|
||||||
|
|
||||||
for line in content.splitlines():
|
for line in content.splitlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
||||||
@@ -233,21 +289,25 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
if "---" in line:
|
if "---" in line:
|
||||||
continue
|
continue
|
||||||
parts = [p.strip() for p in line.split("|")]
|
parts = [p.strip() for p in line.split("|")]
|
||||||
if len(parts) < 10:
|
# Filter out empty parts from leading/trailing pipes
|
||||||
|
data = [p for p in parts if p and "---" not in p]
|
||||||
|
if len(data) < 6:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
test_type = parts[8].strip()
|
# test and t/s are always the last two columns
|
||||||
ts_raw = parts[9].strip()
|
test_type = data[-2]
|
||||||
|
ts_raw = data[-1]
|
||||||
ts_match = re.match(r'([\d.]+)', ts_raw)
|
ts_match = re.match(r'([\d.]+)', ts_raw)
|
||||||
if not ts_match:
|
if not ts_match:
|
||||||
continue
|
continue
|
||||||
results.append({
|
results.append({
|
||||||
"file": logfile.name,
|
"file": logfile.name,
|
||||||
"model": parts[1].strip(),
|
"model": data[0],
|
||||||
"size": parts[2].strip(),
|
"size": data[1],
|
||||||
"backend": parts[4].strip(),
|
"backend": data[3],
|
||||||
"test": test_type,
|
"test": test_type,
|
||||||
"tokens_per_sec": float(ts_match.group(1)),
|
"tokens_per_sec": float(ts_match.group(1)),
|
||||||
|
"kv_cache": kv_type,
|
||||||
"raw": ts_raw,
|
"raw": ts_raw,
|
||||||
})
|
})
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
@@ -264,11 +324,14 @@ with open(sys.argv[1]) as f:
|
|||||||
if not data["results"]:
|
if not data["results"]:
|
||||||
print(" No results parsed.")
|
print(" No results parsed.")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
fmt = " {:<20} {:<16} {:<8} {:>10}"
|
fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
|
||||||
print(fmt.format("Model", "Backend", "Test", "t/s"))
|
print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
|
||||||
print(" " + "-" * 58)
|
print(" " + "-" * 68)
|
||||||
for r in data["results"]:
|
for r in data["results"]:
|
||||||
print(fmt.format(r["model"][:20], r["backend"][:16], r["test"], f"{r['tokens_per_sec']:.2f}"))
|
print(fmt.format(
|
||||||
|
r["model"][:20], r["backend"][:16],
|
||||||
|
r.get("kv_cache", "f16/f16")[:10], r["test"],
|
||||||
|
f"{r['tokens_per_sec']:.2f}"))
|
||||||
PYEOF
|
PYEOF
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
111
scripts/optimize/power-profile.sh
Normal file
111
scripts/optimize/power-profile.sh
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Apply power profile and system tuning for LLM inference workloads
|
||||||
|
# Requires root. Settings are volatile — use the systemd service for persistence.
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||||
|
source "$SCRIPT_DIR/../../lib/format.sh"
|
||||||
|
|
||||||
|
require_root
|
||||||
|
|
||||||
|
# ── Power limits via ryzenadj ─────────────────────────────
|
||||||
|
STAPM=85000
|
||||||
|
FAST=85000
|
||||||
|
SLOW=85000
|
||||||
|
APU_SLOW=85000
|
||||||
|
|
||||||
|
if is_cmd ryzenadj; then
|
||||||
|
log_header "Power Profile (ryzenadj)"
|
||||||
|
log_info "Setting STAPM=${STAPM}mW, Fast=${FAST}mW, Slow=${SLOW}mW, APU=${APU_SLOW}mW"
|
||||||
|
ryzenadj \
|
||||||
|
--stapm-limit=$STAPM \
|
||||||
|
--fast-limit=$FAST \
|
||||||
|
--slow-limit=$SLOW \
|
||||||
|
--apu-slow-limit=$APU_SLOW 2>&1 | grep -E 'Successfully|Error|not supported' || true
|
||||||
|
|
||||||
|
# Verify what actually took effect
|
||||||
|
log_info "Verifying limits..."
|
||||||
|
ryzenadj -i 2>&1 | grep -E 'LIMIT|VALUE' | head -8
|
||||||
|
echo ""
|
||||||
|
log_warn "Note: HP firmware may cap PPT SLOW/APU at 70W regardless of setting"
|
||||||
|
else
|
||||||
|
log_error "ryzenadj not found. Install: cd /tmp && git clone https://github.com/FlyGoat/RyzenAdj.git && cd RyzenAdj && mkdir build && cd build && cmake .. && make && sudo cp ryzenadj /usr/local/bin/"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── VM sysctl tuning ──────────────────────────────────────
|
||||||
|
log_header "VM Sysctl Tuning"
|
||||||
|
|
||||||
|
declare -A SYSCTLS=(
|
||||||
|
[vm.swappiness]=1
|
||||||
|
[vm.dirty_ratio]=40
|
||||||
|
[vm.dirty_background_ratio]=10
|
||||||
|
[vm.max_map_count]=500000
|
||||||
|
[vm.zone_reclaim_mode]=0
|
||||||
|
)
|
||||||
|
|
||||||
|
for KEY in "${!SYSCTLS[@]}"; do
|
||||||
|
VAL="${SYSCTLS[$KEY]}"
|
||||||
|
CURRENT=$(sysctl -n "$KEY" 2>/dev/null || echo "?")
|
||||||
|
if [[ "$CURRENT" == "$VAL" ]]; then
|
||||||
|
log_success "$KEY = $VAL (already set)"
|
||||||
|
else
|
||||||
|
sysctl -w "$KEY=$VAL" > /dev/null 2>&1
|
||||||
|
log_success "$KEY = $VAL (was $CURRENT)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Persist sysctl settings
|
||||||
|
SYSCTL_CONF="/etc/sysctl.d/99-llm-inference.conf"
|
||||||
|
if [[ ! -f "$SYSCTL_CONF" ]]; then
|
||||||
|
log_info "Persisting to $SYSCTL_CONF"
|
||||||
|
cat > "$SYSCTL_CONF" << 'EOF'
|
||||||
|
# LLM inference optimizations
|
||||||
|
vm.swappiness = 1
|
||||||
|
vm.dirty_ratio = 40
|
||||||
|
vm.dirty_background_ratio = 10
|
||||||
|
vm.max_map_count = 500000
|
||||||
|
vm.zone_reclaim_mode = 0
|
||||||
|
EOF
|
||||||
|
log_success "Sysctl config saved (persists across reboots)"
|
||||||
|
else
|
||||||
|
log_info "Sysctl config already exists at $SYSCTL_CONF"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Transparent Huge Pages ────────────────────────────────
|
||||||
|
log_header "Transparent Huge Pages"
|
||||||
|
|
||||||
|
THP_ENABLED=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null || echo "unknown")
|
||||||
|
if [[ "$THP_ENABLED" == *"[always]"* ]]; then
|
||||||
|
log_success "THP = always (already set)"
|
||||||
|
else
|
||||||
|
echo always > /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null || true
|
||||||
|
echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag 2>/dev/null || true
|
||||||
|
log_success "THP = always, defrag = defer+madvise"
|
||||||
|
fi
|
||||||
|
log_info "For persistence, add to kernel cmdline: transparent_hugepage=always"
|
||||||
|
|
||||||
|
# ── RADV nogttspill ───────────────────────────────────────
|
||||||
|
log_header "Vulkan RADV Environment"
|
||||||
|
|
||||||
|
RADV_CONF="/etc/environment.d/radv-llm.conf"
|
||||||
|
if [[ ! -f "$RADV_CONF" ]]; then
|
||||||
|
mkdir -p /etc/environment.d
|
||||||
|
echo 'RADV_PERFTEST=nogttspill' > "$RADV_CONF"
|
||||||
|
log_success "RADV_PERFTEST=nogttspill persisted to $RADV_CONF"
|
||||||
|
log_info "Takes effect on next login. For this session: export RADV_PERFTEST=nogttspill"
|
||||||
|
else
|
||||||
|
log_success "RADV config already exists at $RADV_CONF"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Summary ───────────────────────────────────────────────
|
||||||
|
log_header "Phase 2 Optimization Summary"
|
||||||
|
log_success "Power profile: ryzenadj limits applied (volatile — resets on reboot)"
|
||||||
|
log_success "VM tuning: sysctl applied and persisted"
|
||||||
|
log_success "THP: enabled (volatile — add to kernel cmdline for persistence)"
|
||||||
|
log_success "RADV: nogttspill persisted"
|
||||||
|
echo ""
|
||||||
|
log_info "To persist ryzenadj across reboots:"
|
||||||
|
log_info " sudo cp $SCRIPT_DIR/../../configs/ryzenadj-llm.service /etc/systemd/system/"
|
||||||
|
log_info " sudo systemctl enable --now ryzenadj-llm.service"
|
||||||
136
scripts/serve/launch.sh
Executable file
136
scripts/serve/launch.sh
Executable file
@@ -0,0 +1,136 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Launch llama-server with optimized settings for Strix Halo
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||||
|
source "$SCRIPT_DIR/../../lib/detect.sh"
|
||||||
|
source "$SCRIPT_DIR/../../lib/format.sh"
|
||||||
|
|
||||||
|
MODEL_DIR="$(data_dir models)"
|
||||||
|
BACKEND="llama-vulkan-radv"
|
||||||
|
PORT=8080
|
||||||
|
CTX_SIZE=131072
|
||||||
|
PARALLEL=1
|
||||||
|
MODEL=""
|
||||||
|
NGRAM=false
|
||||||
|
|
||||||
|
while [[ $# -gt 0 ]]; do
|
||||||
|
case "$1" in
|
||||||
|
-m|--model) MODEL="$2"; shift 2 ;;
|
||||||
|
--backend) BACKEND="$2"; shift 2 ;;
|
||||||
|
--port) PORT="$2"; shift 2 ;;
|
||||||
|
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
||||||
|
--parallel) PARALLEL="$2"; shift 2 ;;
|
||||||
|
--ngram) NGRAM=true; shift ;;
|
||||||
|
--help|-h)
|
||||||
|
echo "Usage: launch.sh [OPTIONS]"
|
||||||
|
echo ""
|
||||||
|
echo "Options:"
|
||||||
|
echo " -m, --model FILE GGUF model filename (searches data/models/)"
|
||||||
|
echo " --backend NAME Toolbox backend (default: llama-vulkan-radv)"
|
||||||
|
echo " --port N Listen port (default: 8080)"
|
||||||
|
echo " --ctx N Context size (default: 131072)"
|
||||||
|
echo " --parallel N Parallel request slots (default: 1)"
|
||||||
|
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
||||||
|
echo ""
|
||||||
|
echo "Presets (pass model filename):"
|
||||||
|
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
||||||
|
echo " Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf Agentic coding"
|
||||||
|
echo " Qwen3-Coder-Next-UD-Q3_K_XL.gguf Complex SE tasks"
|
||||||
|
echo ""
|
||||||
|
echo "Examples:"
|
||||||
|
echo " launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
|
||||||
|
echo " launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf --ngram --ctx 262144"
|
||||||
|
echo " launch.sh -m Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf --parallel 2"
|
||||||
|
exit 0 ;;
|
||||||
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -z "$MODEL" ]]; then
|
||||||
|
log_error "No model specified. Use -m MODEL_FILENAME"
|
||||||
|
echo ""
|
||||||
|
echo "Available models:"
|
||||||
|
find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
|
||||||
|
-not -name '*-000*-of-*.gguf' -printf ' %f\n' 2>/dev/null | sort
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Find model file
|
||||||
|
MODEL_PATH="$(find -L "$MODEL_DIR" -type f -name "$MODEL" -print -quit 2>/dev/null)"
|
||||||
|
if [[ -z "$MODEL_PATH" ]]; then
|
||||||
|
log_error "Model not found: $MODEL"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Resolve for toolbox
|
||||||
|
TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
|
||||||
|
if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then
|
||||||
|
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Backend-specific settings
|
||||||
|
declare -A SERVER_PATHS=(
|
||||||
|
[llama-vulkan-radv]="/usr/sbin/llama-server"
|
||||||
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-server"
|
||||||
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-server"
|
||||||
|
[llama-rocm-7.2]="/usr/local/bin/llama-server"
|
||||||
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-server"
|
||||||
|
)
|
||||||
|
|
||||||
|
SERVER_BIN="${SERVER_PATHS[$BACKEND]:-}"
|
||||||
|
if [[ -z "$SERVER_BIN" ]]; then
|
||||||
|
log_error "Unknown backend: $BACKEND"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check toolbox exists
|
||||||
|
if ! toolbox list 2>/dev/null | grep -q "$BACKEND"; then
|
||||||
|
log_error "Toolbox not found: $BACKEND"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Build environment args
|
||||||
|
ENV_ARGS=()
|
||||||
|
if [[ "$BACKEND" == *rocm* ]]; then
|
||||||
|
ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Build server args
|
||||||
|
SERVER_ARGS=(
|
||||||
|
-ngl 99 # Full GPU offload
|
||||||
|
--no-mmap # Direct load, no mmap overhead
|
||||||
|
-fa # Flash attention
|
||||||
|
-m "$TOOLBOX_MODEL_PATH"
|
||||||
|
-c "$CTX_SIZE" # Context size
|
||||||
|
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
||||||
|
--cache-type-v q4_0
|
||||||
|
--port "$PORT"
|
||||||
|
-np "$PARALLEL" # Parallel slots
|
||||||
|
)
|
||||||
|
|
||||||
|
# N-gram speculative decoding
|
||||||
|
if $NGRAM; then
|
||||||
|
SERVER_ARGS+=(
|
||||||
|
--spec-type ngram-simple
|
||||||
|
--draft-max 64
|
||||||
|
--draft-min 4
|
||||||
|
)
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Display config
|
||||||
|
log_header "llama-server"
|
||||||
|
log_info "Model: $(basename "$MODEL_PATH") ($(du -h "$MODEL_PATH" | cut -f1))"
|
||||||
|
log_info "Backend: $BACKEND"
|
||||||
|
log_info "Context: $CTX_SIZE tokens"
|
||||||
|
log_info "KV cache: q4_0/q4_0"
|
||||||
|
log_info "Parallel slots: $PARALLEL"
|
||||||
|
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
||||||
|
log_info "Port: $PORT"
|
||||||
|
log_info "Endpoint: http://localhost:$PORT"
|
||||||
|
echo ""
|
||||||
|
log_info "Starting server..."
|
||||||
|
|
||||||
|
# Launch
|
||||||
|
exec toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$SERVER_BIN" "${SERVER_ARGS[@]}"
|
||||||
@@ -10,6 +10,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--max-size"
|
assert_output --partial "--max-size"
|
||||||
assert_output --partial "--category"
|
assert_output --partial "--category"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
|
assert_output --partial "--kv-types"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "run-suite --help shows usage and exits 0" {
|
@test "run-suite --help shows usage and exits 0" {
|
||||||
@@ -20,6 +21,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--category"
|
assert_output --partial "--category"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--tag"
|
assert_output --partial "--tag"
|
||||||
|
assert_output --partial "--kv-types"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher shows help with no args" {
|
@test "benchmark dispatcher shows help with no args" {
|
||||||
@@ -28,6 +30,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "Commands"
|
assert_output --partial "Commands"
|
||||||
assert_output --partial "--max-size"
|
assert_output --partial "--max-size"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
|
assert_output --partial "--kv-types"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher passes --help through to baseline" {
|
@test "benchmark dispatcher passes --help through to baseline" {
|
||||||
|
|||||||
Reference in New Issue
Block a user