strix-halo-optimizations/scripts/benchmark/compare.sh

#!/usr/bin/env bash
# Side-by-side comparison of two benchmark result directories.
set -o errexit -o nounset -o pipefail

# Resolve the directory holding this script so the project libraries under
# ../../lib load no matter where the caller's working directory is.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for _lib in common format; do
    # shellcheck source=/dev/null
    source "$SCRIPT_DIR/../../lib/${_lib}.sh"
done

# With fewer than two arguments: print usage, list the run directories found
# under the baselines and benchmarks data directories, and exit with status 1.
if (( $# < 2 )); then
    # Print the sub-directories of "$1" without their trailing slash; fall back
    # to "(none)" when the listing pipeline returns a non-zero status.
    _list_run_dirs() {
        ls -d "$1"/*/ 2>/dev/null | sed 's|/$||' || echo "  (none)"
    }

    cat << 'USAGE'
Usage: benchmark compare <before-dir> <after-dir>

Examples:
  bin/benchmark compare data/baselines/20260325-120000 data/benchmarks/post-opt-20260326-100000

Available baselines:
USAGE
    _list_run_dirs "$(data_dir baselines)"
    cat << 'USAGE'

Available benchmark runs:
USAGE
    _list_run_dirs "$(data_dir benchmarks)"
    exit 1
fi

# Positional arguments: the reference run first, the run to compare second.
BEFORE_DIR="$1"
AFTER_DIR="$2"

# Both runs must contain a summary.json; stop at the first one that does not.
for run_dir in "$BEFORE_DIR" "$AFTER_DIR"; do
    [[ -f "$run_dir/summary.json" ]] && continue
    log_error "No summary.json in $run_dir"
    exit 1
done

log_header "Benchmark Comparison"

# Label each run with the last component of its directory path.
before_name="$(basename "$BEFORE_DIR")"
after_name="$(basename "$AFTER_DIR")"
log_info "Before: $before_name"
log_info "After:  $after_name"

#######################################
# Print which tracked system settings differ between two system-state.json
# snapshots: VRAM/GTT pool sizes, selected kernel parameters, tuned profile.
# Arguments: $1 - snapshot of the "before" run
#            $2 - snapshot of the "after" run
# Outputs:   an indented "Configuration changes:" list, or the single line
#            "No configuration changes detected", on stdout
# Returns:   python3's exit status (non-zero if a file cannot be read/parsed)
#######################################
print_config_changes() {
    python3 - "$1" "$2" << 'PYEOF'
import json
import sys


def load(path):
    with open(path) as f:
        return json.load(f)


def text(section, key):
    # Missing and null are both rendered as an empty string.
    value = section.get(key)
    return "" if value is None else value


before, after = load(sys.argv[1]), load(sys.argv[2])
changes = []

# Pool sizes are read as bytes and reported in GiB. A missing or null value
# counts as 0 so an incomplete snapshot cannot crash this informational
# section (which would abort the whole script under `set -e`).
b_mem = before.get("memory") or {}
a_mem = after.get("memory") or {}
for label, key in (("VRAM:", "vram_total_bytes"), ("GTT: ", "gtt_total_bytes")):
    b_bytes = b_mem.get(key) or 0
    a_bytes = a_mem.get(key) or 0
    if b_bytes != a_bytes:
        changes.append(
            f"  {label} {b_bytes / 2**30:.1f} GiB -> {a_bytes / 2**30:.1f} GiB"
        )

b_kern = before.get("kernel") or {}
a_kern = after.get("kernel") or {}
for param in ["param_iommu", "param_gttsize", "param_pages_limit"]:
    bv = text(b_kern, param)
    av = text(a_kern, param)
    if bv != av:
        changes.append(f"  {param}: '{bv}' -> '{av}'")

bt = text(before, "tuned_profile")
at = text(after, "tuned_profile")
if bt != at:
    changes.append(f"  tuned: {bt} -> {at}")

if changes:
    print("  Configuration changes:")
    for c in changes:
        print(c)
else:
    print("  No configuration changes detected")
PYEOF
}

# Show system state diff only when both runs captured a snapshot
if [[ -f "$BEFORE_DIR/system-state.json" && -f "$AFTER_DIR/system-state.json" ]]; then
    echo ""
    print_config_changes "$BEFORE_DIR/system-state.json" "$AFTER_DIR/system-state.json"
fi

#######################################
# Print a throughput table comparing two summary.json files, one row per
# (model, backend, test) key found in either file.
# Arguments: $1 - summary.json of the "before" run
#            $2 - summary.json of the "after" run
# Outputs:   table on stdout; the Delta cell is green for a gain and red for
#            a loss (ANSI colour codes are always emitted)
# Returns:   python3's exit status (non-zero if a file cannot be read/parsed)
#######################################
print_results_table() {
    python3 - "$1" "$2" << 'PYEOF'
import json
import sys

GREEN, RED, RESET = "\033[32m", "\033[31m", "\033[0m"
DELTA_WIDTH = 8
# The last column is padded by delta_cell(): str.format would count the
# invisible colour codes as characters and skip the padding.
ROW = "  {:<18} {:<14} {:<7} {:>9} {:>9} {}"


def load(path):
    with open(path) as f:
        return json.load(f)


def index_results(data):
    """Map (model, backend, test) -> tokens_per_sec for every result entry."""
    return {
        (r["model"], r["backend"], r["test"]): r["tokens_per_sec"]
        for r in data.get("results", [])
    }


def rate_cell(value):
    # Only an absent (or null) measurement is a dash; 0.0 is a recorded result.
    return "—" if value is None else f"{value:.1f}"


def delta_cell(b_val, a_val):
    """Relative change in percent, right-aligned first and coloured second."""
    # No delta without both values; a zero baseline has no defined percentage.
    if b_val is None or a_val is None or b_val == 0:
        return "—".rjust(DELTA_WIDTH)
    pct = (a_val - b_val) / b_val * 100
    if pct > 0:
        text, colour = f"+{pct:.1f}%", GREEN
    elif pct < 0:
        text, colour = f"{pct:.1f}%", RED
    else:
        return "0.0%".rjust(DELTA_WIDTH)
    return f"{colour}{text.rjust(DELTA_WIDTH)}{RESET}"


b_idx = index_results(load(sys.argv[1]))
a_idx = index_results(load(sys.argv[2]))

all_keys = sorted(set(b_idx) | set(a_idx))

if not all_keys:
    print("  No comparable results found.")
    sys.exit(0)

print(ROW.format("Model", "Backend", "Test", "Before", "After",
                 "Delta".rjust(DELTA_WIDTH)))
print("  " + "-" * 70)

for key in all_keys:
    model, backend, test = key
    b_val = b_idx.get(key)
    a_val = a_idx.get(key)
    print(ROW.format(model[:18], backend[:14], test,
                     rate_cell(b_val), rate_cell(a_val),
                     delta_cell(b_val, a_val)))

print()
PYEOF
}

# Compare results
echo ""
print_results_table "$BEFORE_DIR/summary.json" "$AFTER_DIR/summary.json"