Files
strix-halo-optimizations/scripts/benchmark/run-suite.sh
Felipe Cardoso cb25fa3f6f feat: add benchmark filtering (--max-size, --category, --skip-longctx)
Both run-baseline.sh and run-suite.sh now support:
- --max-size N: skip models larger than N GB (prevents OOM)
- --category LIST: filter by catalog category (smoke,dense,moe)
- --skip-longctx: skip 32K context tests (saves time + memory)
- --reps N: configure repetition count
- --help: shows usage with examples

Safe pre-optimization run: benchmark baseline --max-size 20 --skip-longctx
Full post-optimization: benchmark baseline (no filters, all models + longctx)

Also: 4 new BATS tests for flag parsing (98 total, all passing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 19:07:24 +01:00

246 lines
8.1 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Full benchmark suite — run all backends × models with tagging
#
# Runs llama-bench in every detected toolbox backend against every local
# .gguf model, writing per-run logs and a parsed summary.json into a
# tagged, timestamped result directory. See --help for filtering options.
set -euo pipefail

# Resolve the script's own directory so the relative `source` paths work
# no matter which directory the caller runs this from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project helper libraries (provide log_*, data_dir, timestamp,
# detect_toolbox_names, BOLD/RESET — exact split between files not visible
# here).
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

MODEL_DIR="$(data_dir models)"

# Defaults; all overridable via the CLI flags parsed below.
TAG="run"            # run label, becomes part of the result dir name
BACKENDS_FILTER=""   # comma-separated backend allowlist ("" = all)
MODELS_FILTER=""     # comma-separated model filename filter ("" = all)
SKIP_LONGCTX=false   # --skip-longctx: skip the 32K-context tests
MAX_SIZE_GB=0        # --max-size: 0 = no size limit
CATEGORY_FILTER=""   # --category: catalog categories from models.conf
REPS_STANDARD=5      # repetitions for the standard test (--reps)
REPS_LONGCTX=3       # repetitions for the long-context test
# ---- CLI argument parsing --------------------------------------------------
# Guard against a value-taking flag being passed as the last argument:
# a bare `shift 2` with only one argument left fails, which kills the
# script silently under `set -e`. Fail loudly instead.
require_value() {
  if [[ $# -lt 2 ]]; then
    log_error "Option $1 requires a value"
    exit 1
  fi
}
while [[ $# -gt 0 ]]; do
  case "$1" in
    --tag|-t) require_value "$@"; TAG="$2"; shift 2 ;;
    --backends|-b) require_value "$@"; BACKENDS_FILTER="$2"; shift 2 ;;
    --models|-m) require_value "$@"; MODELS_FILTER="$2"; shift 2 ;;
    --skip-longctx) SKIP_LONGCTX=true; shift ;;
    --max-size|-s) require_value "$@"; MAX_SIZE_GB="$2"; shift 2 ;;
    --category|-c) require_value "$@"; CATEGORY_FILTER="$2"; shift 2 ;;
    --reps|-r) require_value "$@"; REPS_STANDARD="$2"; shift 2 ;;
    --help|-h)
      echo "Usage: run-suite.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo " --tag NAME Tag this run (default: run)"
      echo " --backends LIST Comma-separated backend filter"
      echo " --models LIST Comma-separated model filename filter"
      echo " --skip-longctx Skip long-context (32K) tests"
      echo " --max-size GB Only bench models up to this file size in GB"
      echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
      echo " --reps N Standard test repetitions (default: 5)"
      exit 0 ;;
    # Unknown flags are tolerated with a warning (keeps old behavior).
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done
# ---- Result directory + system snapshot ------------------------------------
TS="$(timestamp)"
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
mkdir -p "$RESULT_DIR"
log_header "Benchmark Suite: $TAG"
log_info "Results: $RESULT_DIR"
# Save system state for later comparison. Best-effort: under `set -e` a
# failing report script would otherwise abort the entire suite before a
# single benchmark ran.
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null \
  || log_warn "system-report.sh failed; continuing without system snapshot"
# Discover backends
existing="$(detect_toolbox_names 2>/dev/null || true)"
declare -A BENCH_PATHS=(
[llama-vulkan-radv]="/usr/sbin/llama-bench"
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
[llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
[llama-rocm-7.2]="/usr/local/bin/llama-bench"
[llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)
available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
if echo "$existing" | grep -q "^${tb}$"; then
if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -q "$tb"; then
available_backends+=("$tb")
fi
fi
done
if (( ${#available_backends[@]} == 0 )); then
log_error "No matching backends. Run: make benchmark-setup"
exit 1
fi
log_info "Backends: ${available_backends[*]}"
# ---- Model discovery and filtering -----------------------------------------
# Collect all .gguf files, excluding multimodal projector files and all but
# the first shard of split models.
mapfile -t ALL_MODEL_PATHS < <(
  find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)
MODEL_PATHS=()
for p in "${ALL_MODEL_PATHS[@]}"; do
  local_name="$(basename "$p")"
  # Name filter: keep the model when any comma-separated filter entry occurs
  # (case-insensitive, fixed-string) inside the filename. The previous code
  # grepped in the opposite direction with the filename as the regex, so
  # partial filters like "--models qwen" never matched, and dots in the
  # filename acted as wildcards. Exact-filename entries still match.
  if [[ -n "$MODELS_FILTER" ]]; then
    name_matched=false
    while IFS= read -r pat; do
      [[ -z "$pat" ]] && continue
      if grep -qiF -- "$pat" <<<"$local_name"; then
        name_matched=true
        break
      fi
    done < <(tr ',' '\n' <<<"$MODELS_FILTER")
    $name_matched || continue
  fi
  # Size filter. "--max-size N" means "up to N GB", so a model of exactly
  # N GB passes (previously `>=` skipped it, while the message claimed `>`).
  if (( MAX_SIZE_GB > 0 )); then
    file_size_gb=$(( $(stat -Lc%s "$p" 2>/dev/null || echo 0) / 1073741824 ))
    if (( file_size_gb > MAX_SIZE_GB )); then
      log_info "Skipping $local_name (${file_size_gb} GB > ${MAX_SIZE_GB} GB limit)"
      continue
    fi
  fi
  # Category filter: look the file up in the models.conf catalog
  # (pipe-separated: name|repo|file|size_gb|category|desc). Models not in
  # the catalog pass through unfiltered, preserving old behavior.
  if [[ -n "$CATEGORY_FILTER" ]] && [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
    matched=false
    found_in_catalog=false
    while IFS='|' read -r name repo file size_gb category desc; do
      [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
      if [[ "$local_name" == "$file" ]]; then
        found_in_catalog=true
        # Exact token match (-x): a plain substring grep would let one
        # category name contained in another match spuriously.
        tr ',' '\n' <<<"$CATEGORY_FILTER" | grep -qFx -- "$category" && matched=true
        break
      fi
    done < "$PROJECT_ROOT/configs/models.conf"
    if $found_in_catalog && ! $matched; then
      log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
      continue
    fi
  fi
  MODEL_PATHS+=("$p")
done
if (( ${#MODEL_PATHS[@]} == 0 )); then
  log_error "No models matched filters. Run: make benchmark-setup"
  exit 1
fi
log_info "Models: ${#MODEL_PATHS[@]}"
# Start metric logging: sample system metrics every 2s in the background for
# the duration of the suite; the EXIT trap stops and reaps the logger on any
# exit path (success, failure, or interrupt).
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!
# Single quotes are deliberate: $METRICS_PID expands when the trap fires.
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
# Run benchmarks (same logic as run-baseline.sh).
# Each model × backend pair gets a standard run and (unless skipped) a
# 32K-context run; non-empty log files are treated as done, which makes an
# interrupted suite resumable.
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
  for BACKEND in "${available_backends[@]}"; do
    BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
    # Flatten dots/dashes so the backend name is safe inside log filenames.
    BACKEND_SAFE="${BACKEND//[.-]/_}"
    ENV_ARGS=()
    # NOTE(review): ROCBLAS_USE_HIPBLASLT=1 presumably selects the hipBLASLt
    # GEMM path in the ROCm containers — confirm against rocBLAS docs.
    [[ "$BACKEND" == *rocm* ]] && ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
    # Standard test
    OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log"
    if [[ ! -s "$OUT" ]]; then
      printf "\n${BOLD}>> [%s] %s — standard${RESET}\n" "$BACKEND" "$MODEL_NAME"
      # llama-bench flags: -ngl 99 (GPU layer offload), -mmp 0 (mmap off),
      # -fa 1 (flash attention) — assumed semantics, TODO confirm against
      # the llama-bench version in the containers.
      CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
        -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD")
      if "${CMD[@]}" > "$OUT" 2>&1; then
        log_success "Done"; tail -3 "$OUT"
      else
        # The FAILED marker makes the summary parser skip this log, and the
        # now non-empty file prevents a pointless retry on resume.
        log_error "Failed"; echo "FAILED" >> "$OUT"
      fi
    fi
    # Long-context test
    if $SKIP_LONGCTX; then
      continue
    fi
    OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
    if [[ ! -s "$OUT_LC" ]]; then
      printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME"
      # Vulkan backends get a smaller micro-batch — presumably to fit memory
      # at 32K context; confirm with the backend maintainers.
      UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
      CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
        -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1
        -p 2048 -n 32 -d 32768 -ub "$UB_SIZE" -r "$REPS_LONGCTX")
      if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
        log_success "Done"; tail -3 "$OUT_LC"
      else
        log_error "Failed"; echo "FAILED" >> "$OUT_LC"
      fi
    fi
  done
done
# Parse results
SUMMARY="$RESULT_DIR/summary.json"
# Parse llama-bench log files into summary JSON. The heredoc delimiter is
# quoted ('PYEOF'), so the Python source below is passed through verbatim
# with no shell expansion.
python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF'
import sys, os, re, json
from pathlib import Path
result_dir = Path(sys.argv[1])
results = []
for logfile in sorted(result_dir.glob("*.log")):
    content = logfile.read_text()
    # Failed runs were marked with a FAILED line by the runner above.
    if "FAILED" in content:
        continue
    for line in content.splitlines():
        line = line.strip()
        # Keep only markdown-style table rows; drop the header row (the one
        # containing both "model" and "size") ...
        if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
            continue
        # ... and the |---|---| separator row.
        if "---" in line:
            continue
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 10:
            continue
        try:
            # NOTE(review): column indices assume the llama-bench table
            # layout (1=model, 2=size, 4=backend, 8=test, 9=t/s) — confirm
            # against the llama-bench version in use.
            test_type = parts[8].strip()
            ts_raw = parts[9].strip()
            # Extract the leading numeric value; the full cell (which may
            # carry extra text) is preserved in "raw".
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
            results.append({
                "file": logfile.name,
                "model": parts[1].strip(),
                "size": parts[2].strip(),
                "backend": parts[4].strip(),
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "raw": ts_raw,
            })
        except (ValueError, IndexError):
            # Malformed row — skip it rather than abort the whole parse.
            continue
print(json.dumps({"results": results}, indent=2))
PYEOF
# Pretty-print the parsed summary as an aligned table on stdout.
log_header "Results"
python3 - "$SUMMARY" << 'PYEOF'
import json
import sys

with open(sys.argv[1]) as fh:
    rows = json.load(fh)["results"]

if not rows:
    print(" No results parsed.")
    raise SystemExit(0)

row_fmt = " {:<20} {:<16} {:<8} {:>10}"
print(row_fmt.format("Model", "Backend", "Test", "t/s"))
print(" " + "-" * 58)
for entry in rows:
    print(row_fmt.format(
        entry["model"][:20],
        entry["backend"][:16],
        entry["test"],
        "{:.2f}".format(entry["tokens_per_sec"]),
    ))
PYEOF
echo ""
log_success "Results saved to: $RESULT_DIR"