strix-halo-optimizations/scripts/benchmark/run-baseline.sh
Felipe Cardoso cb25fa3f6f feat: add benchmark filtering (--max-size, --category, --skip-longctx)
Both run-baseline.sh and run-suite.sh now support:
- --max-size GB: skip models larger than N GB (prevents OOM)
- --category LIST: filter by catalog category (smoke,dense,moe)
- --skip-longctx: skip 32K context tests (saves time + memory)
- --reps N: configure repetition count
- --help: shows usage with examples

Safe pre-optimization run: benchmark baseline --max-size 20 --skip-longctx
Full post-optimization: benchmark baseline (no filters, all models + longctx)

Also: 4 new BATS tests for flag parsing (98 total, all passing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 19:07:24 +01:00
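
The flag-parsing BATS tests mentioned above aren't shown on this page; as a rough sketch of what one might look like (test name and relative path here are hypothetical, only --help's behavior is taken from the script below):

    @test "run-baseline.sh --help documents the new filter flags" {
      run bash scripts/benchmark/run-baseline.sh --help
      [ "$status" -eq 0 ]
      [[ "$output" == *"--max-size"* ]]
      [[ "$output" == *"--category"* ]]
      [[ "$output" == *"--skip-longctx"* ]]
    }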


#!/usr/bin/env bash
# Capture pre-optimization baseline benchmark
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
MODEL_DIR="$(data_dir models)"
TS="$(timestamp)"
RESULT_DIR="$(data_dir baselines)/$TS"
mkdir -p "$RESULT_DIR"
REPS_STANDARD=5
REPS_LONGCTX=3
SKIP_LONGCTX=false
MAX_SIZE_GB=0 # 0 = no limit
CATEGORY_FILTER=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-longctx) SKIP_LONGCTX=true; shift ;;
    --max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
    --category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
    --reps|-r) REPS_STANDARD="$2"; shift 2 ;;
    --help|-h)
      echo "Usage: run-baseline.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  --skip-longctx       Skip long-context (32K) tests"
      echo "  --max-size, -s GB    Only bench models up to this file size in GB"
      echo "  --category, -c LIST  Comma-separated: smoke,dense,moe (from models.conf)"
      echo "  --reps, -r N         Standard test repetitions (default: 5)"
      echo "  --help, -h           Show this help"
      echo ""
      echo "Examples:"
      echo "  run-baseline.sh --max-size 20                # Only models ≤20 GB"
      echo "  run-baseline.sh --category smoke,moe         # Only smoke + MoE models"
      echo "  run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
      exit 0 ;;
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done
log_header "Baseline Benchmark Capture"
log_info "Results will be saved to: $RESULT_DIR"
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
# ── 1. Save system state ────────────────────────────────
log_info "Capturing system state..."
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
# ── 2. Discover available toolboxes ─────────────────────
existing="$(detect_toolbox_names 2>/dev/null || true)"
declare -A BENCH_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-bench"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2]="/usr/local/bin/llama-bench"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)
available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
  # Fixed-string whole-line match (-xF): names like llama-rocm-6.4.4 contain
  # regex metacharacters, so a plain "^${tb}$" pattern could match too loosely
  if echo "$existing" | grep -qxF -- "$tb"; then
    available_backends+=("$tb")
    log_success "Backend: $tb"
  fi
done
if (( ${#available_backends[@]} == 0 )); then
  log_error "No toolbox backends found. Run: make benchmark-setup"
  exit 1
fi
# ── 3. Discover and filter models ───────────────────────
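# The find below keeps single-file GGUFs plus only the first shard of split
# models (the '*-00001-of-*.gguf' pattern), and excludes mmproj-* projector
# files, so each model is benchmarked exactly once.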
mapfile -t ALL_MODEL_PATHS < <(
  find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)
MODEL_PATHS=()
for p in "${ALL_MODEL_PATHS[@]}"; do
  # Size filter: compare in bytes so the documented "models ≤ N GB" rule holds;
  # integer GB division would floor a 20.5 GB file to 20 and misclassify it
  if (( MAX_SIZE_GB > 0 )); then
    file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
    if (( file_size_bytes > MAX_SIZE_GB * 1073741824 )); then
      log_info "Skipping $(basename "$p") ($(( file_size_bytes / 1073741824 )) GB > ${MAX_SIZE_GB} GB limit)"
      continue
    fi
  fi
  # Category filter (match against models.conf if available)
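  # models.conf rows are pipe-delimited; the field order is inferred from the
  # read call below, e.g. (hypothetical entry):
  #   llama3-8b|meta-llama/Meta-Llama-3-8B-GGUF|llama3-8b-Q4_K_M.gguf|5|dense|Llama 3 8B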
if [[ -n "$CATEGORY_FILTER" ]]; then
local_name="$(basename "$p")"
matched=false
if [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
while IFS='|' read -r name repo file size_gb category desc; do
[[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
if [[ "$local_name" == "$file" ]]; then
if echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category"; then
matched=true
fi
break
fi
done < "$PROJECT_ROOT/configs/models.conf"
fi
# If model not in catalog, include it (don't filter unknowns)
if ! $matched; then
# Check if we found it in catalog at all
found_in_catalog=false
while IFS='|' read -r name repo file size_gb category desc; do
[[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
[[ "$local_name" == "$file" ]] && found_in_catalog=true && break
done < "$PROJECT_ROOT/configs/models.conf"
if $found_in_catalog; then
log_info "Skipping $(basename "$p") (category not in: $CATEGORY_FILTER)"
continue
fi
fi
fi
MODEL_PATHS+=("$p")
done
if (( ${#MODEL_PATHS[@]} == 0 )); then
  log_error "No models matched filters. Adjust --max-size or --category"
  exit 1
fi
log_info "Benchmarking ${#MODEL_PATHS[@]} model(s):"
for p in "${MODEL_PATHS[@]}"; do
file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
printf " %s (%.1f GB)\n" "$(basename "$p")" "$(echo "scale=1; $file_size_bytes / 1073741824" | bc)"
done
# ── 4. Start metric logging ─────────────────────────────
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!
log_info "Metric logger started (PID: $METRICS_PID)"
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
trap cleanup EXIT
# ── 5. Run benchmarks ───────────────────────────────────
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
  for BACKEND in "${available_backends[@]}"; do
    BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
    BACKEND_SAFE="${BACKEND//[.-]/_}"
    # Build environment args for ROCm backends
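    # ROCBLAS_USE_HIPBLASLT=1 asks rocBLAS to route GEMMs through hipBLASLt
    # where the architecture supports it, which is often faster on recent
    # hardware; it is treated here as a tuning default for the ROCm builds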
    ENV_ARGS=()
    if [[ "$BACKEND" == *rocm* ]]; then
      ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
    fi
    # Standard test (pp512 + tg128, default context)
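    # llama-bench flags: -ngl 99 offloads all layers to the GPU, -mmp 0
    # disables mmap so the model is fully loaded up front, -fa 1 enables
    # flash attention, -r sets the repetition count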
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log"
if [[ ! -s "$OUT" ]]; then
printf "\n${BOLD}>> [%s] %s — standard test${RESET}\n" "$BACKEND" "$MODEL_NAME"
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD")
printf " cmd: %s\n" "${CMD[*]}"
if "${CMD[@]}" > "$OUT" 2>&1; then
log_success "Standard test complete"
tail -5 "$OUT"
else
log_error "Standard test failed (exit $?)"
echo "FAILED" >> "$OUT"
fi
else
log_info "Skipping standard test (log exists): $OUT"
fi
    # Long-context test (pp2048, tg32, ctx 32768)
    if $SKIP_LONGCTX; then
      continue
    fi
    OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
    if [[ ! -s "$OUT_LC" ]]; then
      printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
      UB_SIZE=2048
      [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
      CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
        -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1
        -p 2048 -n 32 -d 32768 -ub "$UB_SIZE"
        -r "$REPS_LONGCTX")
      printf "  cmd: %s\n" "${CMD_LC[*]}"
      if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
        log_success "Long-context test complete"
        tail -5 "$OUT_LC"
      else
        log_error "Long-context test failed (exit $?)"
        echo "FAILED" >> "$OUT_LC"
      fi
    else
      log_info "Skipping long-context test (log exists): $OUT_LC"
    fi
  done
done
# ── 6. Parse results into summary JSON ──────────────────
log_info "Parsing results..."
SUMMARY="$RESULT_DIR/summary.json"
python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF'
import sys, os, re, json
from pathlib import Path
result_dir = Path(sys.argv[1])
results = []
for logfile in sorted(result_dir.glob("*.log")):
    content = logfile.read_text()
    if "FAILED" in content:
        continue
    for line in content.splitlines():
        line = line.strip()
        if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
            continue
        if "---" in line:
            continue
        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 10:
            continue
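        # Assumed llama-bench markdown column layout for this flag set, after
        # splitting on '|': 1=model, 2=size, 4=backend, 8=test, 9=t/s.
        # If the table format changes, these indices need updating.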
        try:
            test_type = parts[8].strip() if len(parts) > 8 else ""
            ts_raw = parts[9].strip() if len(parts) > 9 else ""
            if not test_type or not ts_raw:
                continue
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
            results.append({
                "file": logfile.name,
                "model": parts[1].strip(),
                "size": parts[2].strip(),
                "backend": parts[4].strip(),
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "raw": ts_raw,
            })
        except (ValueError, IndexError):
            continue
print(json.dumps({"results": results}, indent=2))
PYEOF
# ── 7. Display summary ──────────────────────────────────
log_header "Baseline Results"
python3 - "$SUMMARY" << 'PYEOF'
import sys, json
with open(sys.argv[1]) as f:
    data = json.load(f)
if not data["results"]:
    print("  No results parsed. Check log files for errors.")
    sys.exit(0)
fmt = "  {:<20} {:<16} {:<8} {:>10}"
print(fmt.format("Model", "Backend", "Test", "t/s"))
print("  " + "-" * 58)
for r in data["results"]:
    print(fmt.format(
        r["model"][:20],
        r["backend"][:16],
        r["test"],
        f"{r['tokens_per_sec']:.2f}"
    ))
PYEOF
echo ""
log_success "Baseline saved to: $RESULT_DIR"
log_info "Files: system-state.json, summary.json, metrics.csv, *.log"
log_info "Compare later with: bin/benchmark compare $RESULT_DIR <new-run-dir>"