Files
strix-halo-optimizations/scripts/benchmark/run-suite.sh
Felipe Cardoso f92b710492 fix(benchmark): parse llama-bench output with variable column count
KV cache quantization adds type_k/type_v columns to llama-bench output,
shifting test and t/s to different indices. Parse from end of row instead
of hardcoded positions. Also fix KV suffix separator (underscore to dash)
to avoid regex ambiguity with type names like q8_0.

Add 5-phase optimization guide, optimization log for tracking results,
and research docs on llama.cpp and inference landscape optimizations.
2026-03-27 14:54:19 +01:00

328 lines
12 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# run-suite.sh — full benchmark suite: every backend × every model, tagged per run.
set -euo pipefail

# Resolve the directory holding this script and load the shared libraries
# relative to it. The helpers used further down (data_dir, timestamp, log_*,
# detect_toolbox_names, ...) are not defined in this file — presumably they
# come from these three libraries.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source "${SCRIPT_DIR}/../../lib/common.sh"
source "${SCRIPT_DIR}/../../lib/detect.sh"
source "${SCRIPT_DIR}/../../lib/format.sh"

# Directory that is searched for *.gguf model files.
MODEL_DIR=$(data_dir models)

# --- Run selection (all overridable from the command line) -----------------
TAG=run              # prefix of the result directory name
BACKENDS_FILTER=     # comma-separated backend names; empty = all installed
MODELS_FILTER=       # comma-separated model file names; empty = all found
CATEGORY_FILTER=     # comma-separated categories from models.conf; empty = all
MAX_SIZE_GB=0        # 0 disables the file-size filter
SKIP_LONGCTX=false   # true = run only the standard test

# --- Workload parameters ----------------------------------------------------
REPS_STANDARD=5      # repetitions of the standard test
REPS_LONGCTX=3       # repetitions of the long-context test
CTX_DEPTH=32768      # long-context depth in tokens
CTX_PROMPT=2048      # long-context prompt length (rescaled for deeper contexts)
PP_TOKENS=512        # prompt-processing tokens of the standard test
TG_TOKENS=128        # generated tokens of the standard test
KV_TYPES_RAW=        # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
# ---------------------------------------------------------------------------
# Command-line handling
# ---------------------------------------------------------------------------

# Print the option summary and usage examples to stdout.
suite_usage() {
  echo "Usage: run-suite.sh [OPTIONS]"
  echo ""
  echo "Options:"
  echo " --tag NAME Tag this run (default: run)"
  echo " --backends LIST Comma-separated backend filter"
  echo " --models LIST Comma-separated model filename filter"
  echo " --skip-longctx Skip long-context tests"
  echo " --max-size GB Only bench models up to this file size in GB"
  echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
  echo " --reps N Standard test repetitions (default: 5)"
  echo " --context N Long-context depth in tokens (default: 32768)"
  echo " --pp N Prompt processing tokens (default: 512)"
  echo " --tg N Token generation count (default: 128)"
  echo " --kv-types LIST KV cache sweep: comma-separated types to test"
  echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
  echo " Types: f16, q8_0, q4_0, q4_1"
  echo ""
  echo "Examples:"
  echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
  echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
  echo " run-suite.sh --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
  echo " run-suite.sh --tag kv-mixed --kv-types q8_0,q4_0:q8_0 --context 131072"
  echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
}

# Abort with a readable message when a value-taking option is the last word
# on the command line. Without this check "$2" is unset and `set -u` stops the
# script with a bare "unbound variable" error.
# Arguments: $1 - option as typed
#            $2 - number of words left on the command line, option included
suite_require_value() {
  if (( $2 < 2 )); then
    log_error "Option $1 requires a value (see --help)"
    exit 1
  fi
}

# Abort unless the value is a plain non-negative decimal integer.
# Only applied to --max-size: that value is evaluated inside (( )) by the size
# filter, where something like "20.5" is an arithmetic error that silently
# turns the filter off.
# Arguments: $1 - option as typed
#            $2 - value supplied for it
suite_require_uint() {
  if [[ ! "$2" =~ ^(0|[1-9][0-9]*)$ ]]; then
    log_error "Option $1 expects a non-negative integer, got: $2"
    exit 1
  fi
}

# Parse the given command-line words into the global settings (TAG,
# BACKENDS_FILTER, MODELS_FILTER, SKIP_LONGCTX, MAX_SIZE_GB, CATEGORY_FILTER,
# REPS_STANDARD, CTX_DEPTH, PP_TOKENS, TG_TOKENS, KV_TYPES_RAW).
# --help prints the usage text and exits 0; unknown words are reported with
# log_warn and skipped.
suite_parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --tag|-t)
        suite_require_value "$1" "$#"
        TAG="$2"
        shift 2
        ;;
      --backends|-b)
        suite_require_value "$1" "$#"
        BACKENDS_FILTER="$2"
        shift 2
        ;;
      --models|-m)
        suite_require_value "$1" "$#"
        MODELS_FILTER="$2"
        shift 2
        ;;
      --skip-longctx)
        SKIP_LONGCTX=true
        shift
        ;;
      --max-size|-s)
        suite_require_value "$1" "$#"
        suite_require_uint "$1" "$2"
        MAX_SIZE_GB="$2"
        shift 2
        ;;
      --category|-c)
        suite_require_value "$1" "$#"
        CATEGORY_FILTER="$2"
        shift 2
        ;;
      --reps|-r)
        suite_require_value "$1" "$#"
        REPS_STANDARD="$2"
        shift 2
        ;;
      --context|-d)
        suite_require_value "$1" "$#"
        CTX_DEPTH="$2"
        shift 2
        ;;
      --pp)
        suite_require_value "$1" "$#"
        PP_TOKENS="$2"
        shift 2
        ;;
      --tg)
        suite_require_value "$1" "$#"
        TG_TOKENS="$2"
        shift 2
        ;;
      --kv-types)
        suite_require_value "$1" "$#"
        KV_TYPES_RAW="$2"
        shift 2
        ;;
      --help|-h)
        suite_usage
        exit 0
        ;;
      *)
        log_warn "Unknown argument: $1"
        shift
        ;;
    esac
  done
}

suite_parse_args "$@"
# Long-context prompt length: for depths beyond 32768 tokens use one
# sixteenth of the depth, but never fewer than 512 tokens. Shallower depths
# keep whatever CTX_PROMPT already holds.
if (( CTX_DEPTH > 32768 )); then
  CTX_PROMPT=$(( CTX_DEPTH / 16 < 512 ? 512 : CTX_DEPTH / 16 ))
fi

# KV cache sweep list: split the comma-separated request into an array;
# an empty request means a single run with f16.
IFS=',' read -ra KV_TYPES <<< "${KV_TYPES_RAW:-f16}"
# Create the per-run result directory: <benchmarks dir>/<tag>-<timestamp>.
# timestamp and data_dir are helpers defined outside this file.
TS="$(timestamp)"
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
mkdir -p "$RESULT_DIR"
log_header "Benchmark Suite: $TAG"
log_info "Results: $RESULT_DIR"
if (( ${#KV_TYPES[@]} > 1 )); then
  log_info "KV cache sweep: ${KV_TYPES[*]}"
fi

# Save system state
# The report's stderr is discarded, so a failure here used to stop the script
# (set -e) without any message. Still stop with the same status, but say why.
SYSTEM_REPORT="$SCRIPT_DIR/../audit/system-report.sh"
bash "$SYSTEM_REPORT" --json > "$RESULT_DIR/system-state.json" 2>/dev/null || {
  SYSTEM_REPORT_RC=$?
  log_error "Could not capture system state: $SYSTEM_REPORT --json exited with $SYSTEM_REPORT_RC (its stderr is suppressed)"
  exit "$SYSTEM_REPORT_RC"
}
# Discover backends
# Names of the toolboxes present on this machine, one per line. Detection
# errors are deliberately swallowed: an empty list ends in the
# "No matching backends" error below.
existing="$(detect_toolbox_names 2>/dev/null || true)"

# Known backends and the llama-bench binary path used inside each toolbox.
declare -A BENCH_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-bench"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2]="/usr/local/bin/llama-bench"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)

# Associative-array keys come back in an unspecified order; sort them so the
# backends are always benchmarked in the same sequence.
mapfile -t BACKEND_CANDIDATES < <(printf '%s\n' "${!BENCH_PATHS[@]}" | LC_ALL=C sort)

available_backends=()
for tb in "${BACKEND_CANDIDATES[@]}"; do
  # Installed? Compare whole lines as fixed strings: names such as
  # "llama-rocm-6.4.4" contain dots, which a regex treats as wildcards.
  grep -qxF -- "$tb" <<< "$existing" || continue

  # Requested? A backend is selected when its name occurs in the
  # comma-separated filter; backend names contain no commas, so a literal
  # substring test on the whole filter is equivalent to testing each entry.
  if [[ -n "$BACKENDS_FILTER" && "$BACKENDS_FILTER" != *"$tb"* ]]; then
    continue
  fi

  available_backends+=("$tb")
done

if (( ${#available_backends[@]} == 0 )); then
  log_error "No matching backends. Run: make benchmark-setup"
  exit 1
fi
log_info "Backends: ${available_backends[*]}"
# Find and filter models
# Collect every *.gguf below MODEL_DIR (following symlinks), skipping mmproj-*
# files. Of a split model only the first shard (-00001-of-N) is listed.
mapfile -t ALL_MODEL_PATHS < <(
  find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)

# Split the comma-separated filters into one entry per line once, up front,
# so the loop can match against them without building a pipeline per model.
MODELS_FILTER_LINES=${MODELS_FILTER//,/$'\n'}
CATEGORY_FILTER_LINES=${CATEGORY_FILTER//,/$'\n'}

MODEL_PATHS=()
for p in "${ALL_MODEL_PATHS[@]}"; do
  local_name="$(basename "$p")"

  # Name filter: keep the model when some filter entry contains its file
  # name (case-insensitive). -F keeps regex metacharacters in the file name
  # ('.', '[', '*', ...) literal.
  if [[ -n "$MODELS_FILTER" ]]; then
    if ! grep -qiF -- "$local_name" <<< "$MODELS_FILTER_LINES"; then
      continue
    fi
  fi

  # Size filter: file size in whole GiB (rounded down); anything at or above
  # the limit is skipped. An unreadable file counts as 0 bytes.
  # NOTE(review): for a split model this measures the first shard only.
  if (( MAX_SIZE_GB > 0 )); then
    file_size_gb=$(( $(stat -Lc%s "$p" 2>/dev/null || echo 0) / 1073741824 ))
    if (( file_size_gb >= MAX_SIZE_GB )); then
      log_info "Skipping $local_name (${file_size_gb} GB >= ${MAX_SIZE_GB} GB limit)"
      continue
    fi
  fi

  # Category filter: look the file up in models.conf
  # (name|repo|file|size_gb|category|desc). Models that are not listed in
  # the catalog are kept regardless of the requested categories.
  if [[ -n "$CATEGORY_FILTER" ]] && [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
    matched=false
    found_in_catalog=false
    # The "|| [[ -n ... ]]" part keeps a final line without trailing newline.
    while IFS='|' read -r name repo file size_gb category desc || [[ -n "$name" ]]; do
      [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
      if [[ "$local_name" == "$file" ]]; then
        found_in_catalog=true
        # Whole-entry comparison: a substring match would let an empty
        # category field match every requested category.
        if grep -qxF -- "$category" <<< "$CATEGORY_FILTER_LINES"; then
          matched=true
        fi
        break
      fi
    done < "$PROJECT_ROOT/configs/models.conf"
    if $found_in_catalog && ! $matched; then
      log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
      continue
    fi
  fi

  MODEL_PATHS+=("$p")
done

if (( ${#MODEL_PATHS[@]} == 0 )); then
  log_error "No models matched filters. Run: make benchmark-setup"
  exit 1
fi
log_info "Models: ${#MODEL_PATHS[@]}"
# Start metric logging
# The logger runs in the background for the whole suite and writes to
# metrics.csv inside the result directory.
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!

# Stop the background metrics logger and reap it. Both steps are best-effort:
# the logger may already have exited.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}

# Run cleanup on every exit path. The handler must not call `exit` itself:
# doing so (as `exit 0`) would replace the script's real exit status and make
# a failed run look successful to the caller.
trap cleanup EXIT
# Run benchmarks (same logic as run-baseline.sh)
# Matrix: every selected model × every available backend × every KV spec.
# Each cell produces up to two log files in RESULT_DIR: the standard test and,
# unless skipped, the long-context test. A non-empty log is never re-run.
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

  for BACKEND in "${available_backends[@]}"; do
    BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
    # Dots and dashes become underscores so the name is safe in file names.
    BACKEND_SAFE="${BACKEND//[.-]/_}"

    # ROCm backends are launched through `env` with ROCBLAS_USE_HIPBLASLT=1.
    case "$BACKEND" in
      *rocm*) ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1) ;;
      *)      ENV_ARGS=() ;;
    esac

    # Resolve model path for toolbox: anything outside /home is addressed
    # through /run/host (presumably where the toolbox exposes the host
    # filesystem — confirm).
    TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
    case "$TOOLBOX_MODEL_PATH" in
      /home/*) ;;
      *) TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}" ;;
    esac

    # Leading part of the command line shared by both test flavours.
    BENCH_BASE=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
      -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1)

    for KV_SPEC in "${KV_TYPES[@]}"; do
      # KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0.
      # Without a colon both expansions leave the spec untouched.
      KV_K="${KV_SPEC%%:*}"
      KV_V="${KV_SPEC##*:}"

      # f16/f16 is the default: no extra flags, no file-name suffix.
      if [[ "$KV_K" == "f16" && "$KV_V" == "f16" ]]; then
        KV_ARGS=()
        KV_SUFFIX=""
      else
        KV_ARGS=(-ctk "$KV_K" -ctv "$KV_V")
        KV_SUFFIX="__kv_${KV_K}-${KV_V}"
      fi

      # ---- Standard test -------------------------------------------------
      # Non-default pp/tg sizes are encoded in the log file name.
      if [[ "$PP_TOKENS" == "512" && "$TG_TOKENS" == "128" ]]; then
        local_suffix="fa1"
      else
        local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
      fi
      OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
      if [[ ! -s "$OUT" ]]; then
        printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
          "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
        CMD=("${BENCH_BASE[@]}"
          -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
        if "${CMD[@]}" > "$OUT" 2>&1; then
          log_success "Done"
          tail -3 "$OUT"
        else
          log_error "Failed"
          # Marker line the summary parser looks for to skip this log.
          echo "FAILED" >> "$OUT"
        fi
      fi

      # ---- Long-context test ---------------------------------------------
      if ! $SKIP_LONGCTX; then
        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
        if [[ ! -s "$OUT_LC" ]]; then
          printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
            "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
          # Vulkan backends get a smaller micro-batch than the others.
          case "$BACKEND" in
            *vulkan*) UB_SIZE=512 ;;
            *)        UB_SIZE=2048 ;;
          esac
          CMD_LC=("${BENCH_BASE[@]}"
            -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
          if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
            log_success "Done"
            tail -3 "$OUT_LC"
          else
            log_error "Failed"
            echo "FAILED" >> "$OUT_LC"
          fi
        fi
      fi
    done # KV_TYPES
  done
done
# Parse results
SUMMARY="$RESULT_DIR/summary.json"
# Parse llama-bench log files into summary JSON
# The embedded script reads every *.log in the result directory given as its
# first argument and prints {"results": [...]} to stdout.
python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF'
import json
import re
import sys
from pathlib import Path

result_dir = Path(sys.argv[1])
results = []

for logfile in sorted(result_dir.glob("*.log")):
    # The t/s column contains a non-ASCII "±". Decode explicitly as UTF-8
    # instead of relying on the locale, and never abort the whole summary
    # because of a stray undecodable byte in one log.
    content = logfile.read_text(encoding="utf-8", errors="replace")

    # Runs that failed carry a "FAILED" marker appended by the suite.
    if "FAILED" in content:
        continue

    # Extract KV cache type from filename (__kv_q8_0-q8_0); logs without the
    # suffix were run with the default f16 cache for both K and V.
    kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"

    for line in content.splitlines():
        line = line.strip()
        # Only markdown table rows are of interest; skip the header row ...
        if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
            continue
        # ... and the separator row.
        if "---" in line:
            continue

        parts = [p.strip() for p in line.split("|")]
        # Filter out empty parts from leading/trailing pipes
        data = [p for p in parts if p and "---" not in p]
        if len(data) < 6:
            continue

        try:
            # test and t/s are always the last two columns; the number of
            # columns in between varies (e.g. type_k/type_v are added when
            # the KV cache is quantized).
            test_type = data[-2]
            ts_raw = data[-1]
            # Keep only the leading number of e.g. "123.45 ± 0.67".
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
            results.append({
                "file": logfile.name,
                "model": data[0],
                "size": data[1],
                "backend": data[3],
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "kv_cache": kv_type,
                "raw": ts_raw,
            })
        except (ValueError, IndexError):
            # Malformed row (e.g. "1.2.3" as number): ignore it.
            continue

print(json.dumps({"results": results}, indent=2))
PYEOF
log_header "Results"
# Render summary.json as a fixed-width table on stdout.
python3 - "$SUMMARY" << 'PYEOF'
import json
import sys

with open(sys.argv[1]) as handle:
    rows = json.load(handle)["results"]

if not rows:
    print(" No results parsed.")
    sys.exit(0)

# One template for the header and every data row keeps the columns aligned.
row_template = " {:<20} {:<16} {:<10} {:<8} {:>10}"
print(row_template.format("Model", "Backend", "KV cache", "Test", "t/s"))
print(" " + 68 * "-")

for row in rows:
    # Long model, backend and KV names are cut to their column width.
    model = row["model"][:20]
    backend = row["backend"][:16]
    kv_cache = row.get("kv_cache", "f16/f16")[:10]
    speed = "{:.2f}".format(row["tokens_per_sec"])
    print(row_template.format(model, backend, kv_cache, row["test"], speed))
PYEOF
printf '\n'
log_success "Results saved to: $RESULT_DIR"