Files
strix-halo-optimizations/scripts/benchmark/compare.sh
Felipe Cardoso e9cb5c491f fix+test: improve test suite, fix 2 bugs found by tests
Bugs fixed in production code:
- compare.sh: Python truthiness on 0.0 — `if b_val` was False for 0.0 t/s,
  displaying it as a dash instead of "0.0". Fixed with `is not None` checks.
- compare.sh: ZeroDivisionError when computing delta % with 0.0 baseline.

Test improvements (review findings):
- detect.bats: kernel param tests now use real detect_kernel_param logic
  pattern (not a separate reimplementation). Added non-GiB-aligned RAM test,
  device ID without 0x prefix, empty firmware version, llama-bench detection,
  detect_total_physical_ram_kb tests.
- benchmark_compare.bats: assert delta percentages (+20.0%, -25.0%, 0.0%),
  test 0.0 t/s edge case, test per-directory error messages, test config
  change detection with specific field assertions.
- log_metrics.bats: add assert_success, --help test, timestamp format
  validation. Remove unused mock sysfs setup.
- common.bats: fix data_dir test, remove redundant assertion, add cleanup.
- test_helper.sh: remove unused FIXTURES_DIR.
- Remove empty tests/fixtures/ directory.

94 tests, all passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 22:22:41 +01:00

141 lines
4.0 KiB
Bash

#!/usr/bin/env bash
# Compare two benchmark runs side-by-side
set -euo pipefail

# Resolve the directory containing this script so the shared libraries can
# be sourced no matter what the caller's working directory is.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
# Print each run directory under $1, or a placeholder when none exist.
# Replaces the previous `ls -d ... | sed` pipeline: parsing ls output is
# fragile (breaks on filenames containing newlines), whereas a glob plus
# parameter expansion handles any name. The user-visible output is the
# same, including the " (none)" fallback when the glob matches nothing.
list_run_dirs() {
  local found=0 d
  for d in "$1"/*/; do
    [[ -d "$d" ]] || continue   # skip the literal pattern when glob has no match
    echo "${d%/}"               # strip trailing slash, as the old sed did
    found=1
  done
  (( found )) || echo " (none)"
}

# With fewer than two arguments, show usage plus the runs available on
# disk, then exit non-zero.
if [[ $# -lt 2 ]]; then
  echo "Usage: benchmark compare <before-dir> <after-dir>"
  echo ""
  echo "Examples:"
  echo " bin/benchmark compare data/baselines/20260325-120000 data/benchmarks/post-opt-20260326-100000"
  echo ""
  echo "Available baselines:"
  list_run_dirs "$(data_dir baselines)"
  echo ""
  echo "Available benchmark runs:"
  list_run_dirs "$(data_dir benchmarks)"
  exit 1
fi
# Positional arguments: the two run directories to compare.
BEFORE_DIR=$1
AFTER_DIR=$2

# Each run directory must contain a summary.json; bail out otherwise.
for run_dir in "$BEFORE_DIR" "$AFTER_DIR"; do
  if ! [[ -f "$run_dir/summary.json" ]]; then
    log_error "No summary.json in $run_dir"
    exit 1
  fi
done
log_header "Benchmark Comparison"

# Label each run by its directory basename (the run's timestamped name).
log_info "Before: $(basename "$BEFORE_DIR")"
log_info "After: $(basename "$AFTER_DIR")"
# Show system state diff if available: compares the key configuration
# fields of the two system-state.json snapshots and prints any changes.
if [[ -f "$BEFORE_DIR/system-state.json" ]] && [[ -f "$AFTER_DIR/system-state.json" ]]; then
  echo ""
  python3 - "$BEFORE_DIR/system-state.json" "$AFTER_DIR/system-state.json" << 'PYEOF'
import sys, json


def load_state(path):
    """Parse one system-state.json snapshot."""
    with open(path) as fh:
        return json.load(fh)


before = load_state(sys.argv[1])
after = load_state(sys.argv[2])

changes = []

# Memory configuration: VRAM and GTT totals, reported in GiB.
b_mem = before.get("memory", {})
a_mem = after.get("memory", {})
for key, label in (("vram_total_bytes", "VRAM"), ("gtt_total_bytes", "GTT")):
    if b_mem.get(key) != a_mem.get(key):
        old_gib = b_mem.get(key, 0) / 2**30
        new_gib = a_mem.get(key, 0) / 2**30
        changes.append(f" {label}: {old_gib:.1f} GiB -> {new_gib:.1f} GiB")

# Kernel boot parameters captured in the snapshot.
b_kern = before.get("kernel", {})
a_kern = after.get("kernel", {})
for param in ("param_iommu", "param_gttsize", "param_pages_limit"):
    old = b_kern.get(param, "")
    new = a_kern.get(param, "")
    if old != new:
        changes.append(f" {param}: '{old}' -> '{new}'")

# Active tuned profile.
old_profile = before.get("tuned_profile", "")
new_profile = after.get("tuned_profile", "")
if old_profile != new_profile:
    changes.append(f" tuned: {old_profile} -> {new_profile}")

if changes:
    print(" Configuration changes:")
    for line in changes:
        print(line)
else:
    print(" No configuration changes detected")
PYEOF
fi
# Compare results: side-by-side tokens/sec table with a colorized delta
# column, matched on (model, backend, test) across the two summaries.
echo ""
python3 - "$BEFORE_DIR/summary.json" "$AFTER_DIR/summary.json" << 'PYEOF'
import sys, json

GREEN = "\033[32m"
RED = "\033[31m"
RESET = "\033[0m"


def load_throughput(path):
    """Map (model, backend, test) -> tokens/sec for one summary.json."""
    with open(path) as fh:
        data = json.load(fh)
    return {
        (r["model"], r["backend"], r["test"]): r["tokens_per_sec"]
        for r in data.get("results", [])
    }


def fmt_delta(b_val, a_val):
    """Colorized percent change; an em dash when either side is missing."""
    if b_val is None or a_val is None:
        return "—"
    if b_val != 0:
        pct = (a_val - b_val) / b_val * 100
    else:
        # 0.0 baseline: any gain counts as infinite, no change as 0.
        pct = float("inf") if a_val > 0 else 0
    if pct > 0:
        return f"{GREEN}+{pct:.1f}%{RESET}"
    if pct < 0:
        return f"{RED}{pct:.1f}%{RESET}"
    return "0.0%"


b_idx = load_throughput(sys.argv[1])
a_idx = load_throughput(sys.argv[2])

all_keys = sorted(b_idx.keys() | a_idx.keys())
if not all_keys:
    print(" No comparable results found.")
    sys.exit(0)

# NOTE(review): ANSI escape sequences count toward the {:>8} field width,
# so colorized delta cells are padded slightly narrower than plain ones —
# confirm whether the misalignment is acceptable before changing it.
fmt = " {:<18} {:<14} {:<7} {:>9} {:>9} {:>8}"
print(fmt.format("Model", "Backend", "Test", "Before", "After", "Delta"))
print(" " + "-" * 70)

for key in all_keys:
    model, backend, test = key
    b_val = b_idx.get(key)
    a_val = a_idx.get(key)
    b_str = f"{b_val:.1f}" if b_val is not None else "—"
    a_str = f"{a_val:.1f}" if a_val is not None else "—"
    print(fmt.format(model[:18], backend[:14], test, b_str, a_str,
                     fmt_delta(b_val, a_val)))
print()
PYEOF