feat(serve): set APEX I-Compact as default, harden benchmark workflow
Serving:
- make serve now launches Claude-distilled APEX 35B-A3B (16GB) with 2 parallel slots and 256K context as the daily driver
- add serve-custom for ad-hoc model testing
- add flush-gpu to reclaim unified memory after stuck runs

Benchmarks:
- default Vulkan-only backends (ROCm trails at long context)
- add --backends filter to run-baseline.sh
- fix backend filter substring bug (grep -qFx for exact line match)
- fix model filter regex metacharacter bug (grep -qiF for literal)
- respect --tg in long-context tests instead of hardcoded n=32

ROCm bump to 7.2.1 (kernel 6.18.4+ patch); keep 7.2 as optional.

Catalog:
- add mudler APEX I-Compact (Claude-distilled 35B, 17GB)
- add 0xSero REAP-40 (pruned 122B-A10B, 46GB)
- update download instructions: hf download (huggingface-cli is gone)
This commit is contained in:
13
Makefile
13
Makefile
@@ -39,12 +39,23 @@ benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare
|
|||||||
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
||||||
|
|
||||||
# --- Serve ---
|
# --- Serve ---
|
||||||
serve: ## Launch llama-server with optimized settings (ARGS="-m MODEL.gguf")
|
serve: ## Launch APEX I-Compact daily driver (2 slots, 256K ctx)
|
||||||
|
@bash bin/serve -m Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf --parallel 2 --ctx 262144 $(ARGS)
|
||||||
|
|
||||||
|
serve-custom: ## Launch llama-server with custom model (ARGS="-m MODEL.gguf")
|
||||||
@bash bin/serve $(ARGS)
|
@bash bin/serve $(ARGS)
|
||||||
|
|
||||||
serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf")
|
serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf")
|
||||||
@bash bin/serve --ngram $(ARGS)
|
@bash bin/serve --ngram $(ARGS)
|
||||||
|
|
||||||
|
flush-gpu: ## Kill llama-server/bench processes and drop kernel caches to free unified VRAM
|
||||||
|
-@pkill -x llama-server 2>/dev/null || true
|
||||||
|
-@pkill -x llama-bench 2>/dev/null || true
|
||||||
|
-@pkill -x llama-cli 2>/dev/null || true
|
||||||
|
-@podman ps --filter name=llama --format '{{.Names}}' | xargs -r podman stop
|
||||||
|
@sync && sudo sysctl vm.drop_caches=3
|
||||||
|
@echo "VRAM usage:" && cat /sys/class/drm/card*/device/mem_info_vram_used 2>/dev/null | awk '{printf " %.2f MiB\n", $$1/1048576}'
|
||||||
|
|
||||||
# --- Hardware Info ---
|
# --- Hardware Info ---
|
||||||
hw-bandwidth: ## Measure GPU memory bandwidth and compute (clpeak)
|
hw-bandwidth: ## Measure GPU memory bandwidth and compute (clpeak)
|
||||||
@clpeak 2>&1
|
@clpeak 2>&1
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# Format: NAME|HF_REPO|FILE|SIZE_GB|CATEGORY|DESCRIPTION
|
# Format: NAME|HF_REPO|FILE|SIZE_GB|CATEGORY|DESCRIPTION
|
||||||
#
|
#
|
||||||
# Categories: smoke, standard, moe, dense
|
# Categories: smoke, standard, moe, dense
|
||||||
# Download with: huggingface-cli download REPO FILE --local-dir /data/models/llms/REPO
|
# Download with: hf download REPO FILE --local-dir /data/models/llms/REPO
|
||||||
|
|
||||||
# ── Smoke tests (quick, small) ───────────────────────────
|
# ── Smoke tests (quick, small) ───────────────────────────
|
||||||
qwen2.5-0.5b-q8|lmstudio-community/Qwen2.5-0.5B-Instruct-GGUF|Qwen2.5-0.5B-Instruct-Q8_0.gguf|0.4|smoke|Tiny Qwen2.5, Q8
|
qwen2.5-0.5b-q8|lmstudio-community/Qwen2.5-0.5B-Instruct-GGUF|Qwen2.5-0.5B-Instruct-Q8_0.gguf|0.4|smoke|Tiny Qwen2.5, Q8
|
||||||
@@ -26,11 +26,15 @@ qwen3.5-27b-opus-distill|Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distille
|
|||||||
# ── MoE models (fast generation, best for 64GB) ─────────
|
# ── MoE models (fast generation, best for 64GB) ─────────
|
||||||
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
||||||
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
||||||
|
qwen3.5-35b-a3b-apex-compact|mudler/Qwen3.5-35B-A3B-Claude-Distilled-APEX-GGUF|Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf|17|moe|MoE 35B Claude-distilled APEX, I-Compact quant
|
||||||
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
||||||
|
|
||||||
# ── Coding models ─────────────────────────────────────────
|
# ── Coding models ─────────────────────────────────────────
|
||||||
qwen3-coder-30b-a3b-q6|unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF|Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf|26|moe|Agentic coding MoE, pure Transformer
|
qwen3-coder-30b-a3b-q6|unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF|Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf|26|moe|Agentic coding MoE, pure Transformer
|
||||||
qwen3-coder-next-q3|unsloth/Qwen3-Coder-Next-GGUF|Qwen3-Coder-Next-UD-Q3_K_XL.gguf|34|moe|80B MoE coder, >70% SWE-bench, hybrid DeltaNet
|
qwen3-coder-next-q3|unsloth/Qwen3-Coder-Next-GGUF|Qwen3-Coder-Next-UD-Q3_K_XL.gguf|34|moe|80B MoE coder, >70% SWE-bench, hybrid DeltaNet
|
||||||
|
|
||||||
|
# ── Pruned MoE (REAP expert pruning) ─────────────────────
|
||||||
|
qwen3.5-122b-a10b-reap40-q4|0xSero/Qwen3.5-122B-A10B-REAP-40-GGUF|Qwen3.5-122B-A10B-REAP-40-Q4_K_M.gguf|46|moe|122B MoE pruned to 40 experts, 10B active, Q4_K_M
|
||||||
|
|
||||||
# ── Draft models (speculative decoding) ───────────────────
|
# ── Draft models (speculative decoding) ───────────────────
|
||||||
qwen3.5-0.8b-q8-draft|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|draft|Draft for Qwen3.5 speculative decoding
|
qwen3.5-0.8b-q8-draft|unsloth/Qwen3.5-0.8B-GGUF|Qwen3.5-0.8B-Q8_0.gguf|0.8|draft|Draft for Qwen3.5 speculative decoding
|
||||||
|
|||||||
@@ -41,7 +41,8 @@ The `-fa 1 -mmp 0 -ngl 99` flags are **mandatory** on Strix Halo to avoid crashe
|
|||||||
| `llama-vulkan-radv` | Mesa RADV | Vulkan | Most stable, recommended default |
|
| `llama-vulkan-radv` | Mesa RADV | Vulkan | Most stable, recommended default |
|
||||||
| `llama-vulkan-amdvlk` | AMDVLK | Vulkan | Fastest when it works, 2GB buffer limit |
|
| `llama-vulkan-amdvlk` | AMDVLK | Vulkan | Fastest when it works, 2GB buffer limit |
|
||||||
| `llama-rocm-6.4.4` | ROCm 6.4.4 | HIP | Proven stable |
|
| `llama-rocm-6.4.4` | ROCm 6.4.4 | HIP | Proven stable |
|
||||||
| `llama-rocm-7.2` | ROCm 7.2 | HIP | Latest, compiler fixes applied |
|
| `llama-rocm-7.2.1` | ROCm 7.2.1 | HIP | Current stable (kernel 6.18.4+ patch) |
|
||||||
|
| `llama-rocm-7.2` | ROCm 7.2 | HIP | Deprecated — use 7.2.1 |
|
||||||
| `llama-rocm7-nightlies` | ROCm 7 nightly | HIP | Experimental/development builds |
|
| `llama-rocm7-nightlies` | ROCm 7 nightly | HIP | Experimental/development builds |
|
||||||
|
|
||||||
Containers are from [kyuz0/amd-strix-halo-toolboxes](https://github.com/kyuz0/amd-strix-halo-toolboxes). Set up with `make benchmark-setup`.
|
Containers are from [kyuz0/amd-strix-halo-toolboxes](https://github.com/kyuz0/amd-strix-halo-toolboxes). Set up with `make benchmark-setup`.
|
||||||
|
|||||||
@@ -23,12 +23,14 @@ PP_TOKENS=512
|
|||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
||||||
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
BACKENDS_FILTER="llama-vulkan-radv" # Default to Vulkan; use --backends to override
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--skip-longctx) SKIP_LONGCTX=true; shift ;;
|
--skip-longctx) SKIP_LONGCTX=true; shift ;;
|
||||||
--max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
|
--max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
|
||||||
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
|
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
|
||||||
|
--backends) BACKENDS_FILTER="$2"; shift 2 ;;
|
||||||
--reps|-r) REPS_STANDARD="$2"; shift 2 ;;
|
--reps|-r) REPS_STANDARD="$2"; shift 2 ;;
|
||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
@@ -42,6 +44,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --skip-longctx Skip long-context tests"
|
echo " --skip-longctx Skip long-context tests"
|
||||||
echo " --max-size GB Only bench models up to this file size in GB"
|
echo " --max-size GB Only bench models up to this file size in GB"
|
||||||
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
|
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
|
||||||
|
echo " --backends LIST Comma-separated backends (default: llama-vulkan-radv)"
|
||||||
echo " --reps N Standard test repetitions (default: 5)"
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
@@ -94,15 +97,18 @@ declare -A BENCH_PATHS=(
|
|||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
||||||
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
||||||
|
[llama-rocm-7.2.1]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
||||||
)
|
)
|
||||||
|
|
||||||
available_backends=()
|
available_backends=()
|
||||||
for tb in "${!BENCH_PATHS[@]}"; do
|
for tb in "${!BENCH_PATHS[@]}"; do
|
||||||
if echo "$existing" | grep -q "^${tb}$"; then
|
if echo "$existing" | grep -q "^${tb}$"; then
|
||||||
|
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -qFx "$tb"; then
|
||||||
available_backends+=("$tb")
|
available_backends+=("$tb")
|
||||||
log_success "Backend: $tb"
|
log_success "Backend: $tb"
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
if (( ${#available_backends[@]} == 0 )); then
|
if (( ${#available_backends[@]} == 0 )); then
|
||||||
@@ -269,7 +275,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
|
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
-p "$CTX_PROMPT" -n "$TG_TOKENS" -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
||||||
-r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
-r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD_LC[*]}"
|
printf " cmd: %s\n" "${CMD_LC[*]}"
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ source "$SCRIPT_DIR/../../lib/format.sh"
|
|||||||
|
|
||||||
MODEL_DIR="$(data_dir models)"
|
MODEL_DIR="$(data_dir models)"
|
||||||
TAG="run"
|
TAG="run"
|
||||||
BACKENDS_FILTER=""
|
BACKENDS_FILTER="llama-vulkan-radv"
|
||||||
MODELS_FILTER=""
|
MODELS_FILTER=""
|
||||||
SKIP_LONGCTX=false
|
SKIP_LONGCTX=false
|
||||||
MAX_SIZE_GB=0
|
MAX_SIZE_GB=0
|
||||||
@@ -99,13 +99,14 @@ declare -A BENCH_PATHS=(
|
|||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
||||||
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
|
||||||
|
[llama-rocm-7.2.1]="/usr/local/bin/llama-bench"
|
||||||
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
|
||||||
)
|
)
|
||||||
|
|
||||||
available_backends=()
|
available_backends=()
|
||||||
for tb in "${!BENCH_PATHS[@]}"; do
|
for tb in "${!BENCH_PATHS[@]}"; do
|
||||||
if echo "$existing" | grep -q "^${tb}$"; then
|
if echo "$existing" | grep -q "^${tb}$"; then
|
||||||
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -q "$tb"; then
|
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -qFx "$tb"; then
|
||||||
available_backends+=("$tb")
|
available_backends+=("$tb")
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@@ -130,7 +131,7 @@ for p in "${ALL_MODEL_PATHS[@]}"; do
|
|||||||
|
|
||||||
# Name filter
|
# Name filter
|
||||||
if [[ -n "$MODELS_FILTER" ]]; then
|
if [[ -n "$MODELS_FILTER" ]]; then
|
||||||
if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$local_name"; then
|
if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qiF "$local_name"; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@@ -252,7 +253,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
-p "$CTX_PROMPT" -n "$TG_TOKENS" -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
log_success "Done"; tail -3 "$OUT_LC"
|
log_success "Done"; tail -3 "$OUT_LC"
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ log_header "Benchmark Setup"
|
|||||||
# ── 1. Check toolbox containers ──────────────────────────
|
# ── 1. Check toolbox containers ──────────────────────────
|
||||||
log_info "Checking toolbox containers..."
|
log_info "Checking toolbox containers..."
|
||||||
|
|
||||||
REQUIRED_TOOLBOXES=("llama-vulkan-radv" "llama-rocm-7.2")
|
REQUIRED_TOOLBOXES=("llama-vulkan-radv" "llama-rocm-7.2.1")
|
||||||
OPTIONAL_TOOLBOXES=("llama-rocm-6.4.4" "llama-vulkan-amdvlk")
|
OPTIONAL_TOOLBOXES=("llama-rocm-7.2" "llama-rocm-6.4.4" "llama-vulkan-amdvlk")
|
||||||
|
|
||||||
existing=$(detect_toolbox_names 2>/dev/null || true)
|
existing=$(detect_toolbox_names 2>/dev/null || true)
|
||||||
missing=()
|
missing=()
|
||||||
|
|||||||
@@ -79,6 +79,7 @@ declare -A SERVER_PATHS=(
|
|||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-server"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-server"
|
||||||
[llama-rocm-6.4.4]="/usr/local/bin/llama-server"
|
[llama-rocm-6.4.4]="/usr/local/bin/llama-server"
|
||||||
[llama-rocm-7.2]="/usr/local/bin/llama-server"
|
[llama-rocm-7.2]="/usr/local/bin/llama-server"
|
||||||
|
[llama-rocm-7.2.1]="/usr/local/bin/llama-server"
|
||||||
[llama-rocm7-nightlies]="/usr/local/bin/llama-server"
|
[llama-rocm7-nightlies]="/usr/local/bin/llama-server"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user