#!/usr/bin/env bash # Capture pre-optimization baseline benchmark set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../../lib/common.sh" source "$SCRIPT_DIR/../../lib/detect.sh" source "$SCRIPT_DIR/../../lib/format.sh" MODEL_DIR="$(data_dir models)" TS="$(timestamp)" RESULT_DIR="$(data_dir baselines)/$TS" mkdir -p "$RESULT_DIR" REPS_STANDARD=5 REPS_LONGCTX=3 log_header "Baseline Benchmark Capture" log_info "Results will be saved to: $RESULT_DIR" # ── 1. Save system state ──────────────────────────────── log_info "Capturing system state..." bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null # ── 2. Discover available toolboxes and models ────────── existing="$(detect_toolbox_names 2>/dev/null || true)" # Map toolbox names to llama-bench commands (same pattern as upstream) declare -A BENCH_PATHS=( [llama-vulkan-radv]="/usr/sbin/llama-bench" [llama-vulkan-amdvlk]="/usr/sbin/llama-bench" [llama-rocm-6.4.4]="/usr/local/bin/llama-bench" [llama-rocm-7.2]="/usr/local/bin/llama-bench" [llama-rocm7-nightlies]="/usr/local/bin/llama-bench" ) available_backends=() for tb in "${!BENCH_PATHS[@]}"; do if echo "$existing" | grep -q "^${tb}$"; then available_backends+=("$tb") log_success "Backend: $tb" fi done if (( ${#available_backends[@]} == 0 )); then log_error "No toolbox backends found. Run: make benchmark-setup" exit 1 fi # Find models mapfile -t MODEL_PATHS < <( find "$MODEL_DIR" -type f -name '*.gguf' \ \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \ | sort ) if (( ${#MODEL_PATHS[@]} == 0 )); then log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup" exit 1 fi log_info "Found ${#MODEL_PATHS[@]} model(s):" for p in "${MODEL_PATHS[@]}"; do printf " %s (%s)\n" "$(basename "$p")" "$(du -h "$p" | cut -f1)" done # ── 3. Start metric logging ───────────────────────────── METRICS_FILE="$RESULT_DIR/metrics.csv" bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 & METRICS_PID=$! log_info "Metric logger started (PID: $METRICS_PID)" cleanup() { kill "$METRICS_PID" 2>/dev/null || true wait "$METRICS_PID" 2>/dev/null || true } trap cleanup EXIT # ── 4. Run benchmarks ─────────────────────────────────── for MODEL_PATH in "${MODEL_PATHS[@]}"; do MODEL_NAME="$(basename "$MODEL_PATH" .gguf)" for BACKEND in "${available_backends[@]}"; do BENCH_BIN="${BENCH_PATHS[$BACKEND]}" BACKEND_SAFE="${BACKEND//[.-]/_}" # Build environment args for ROCm backends ENV_ARGS=() if [[ "$BACKEND" == *rocm* ]]; then ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1) fi # Standard test (pp512 + tg128, default context) OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log" if [[ ! -s "$OUT" ]]; then printf "\n${BOLD}>> [%s] %s — standard test${RESET}\n" "$BACKEND" "$MODEL_NAME" CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD") printf " cmd: %s\n" "${CMD[*]}" if "${CMD[@]}" > "$OUT" 2>&1; then log_success "Standard test complete" tail -5 "$OUT" else log_error "Standard test failed (exit $?)" echo "FAILED" >> "$OUT" fi else log_info "Skipping standard test (log exists): $OUT" fi # Long-context test (pp2048, tg32, ctx 32768) OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log" if [[ ! -s "$OUT_LC" ]]; then printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME" UB_SIZE=2048 [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512 CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -p 2048 -n 32 -d 32768 -ub "$UB_SIZE" -r "$REPS_LONGCTX") printf " cmd: %s\n" "${CMD_LC[*]}" if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then log_success "Long-context test complete" tail -5 "$OUT_LC" else log_error "Long-context test failed (exit $?)" echo "FAILED" >> "$OUT_LC" fi else log_info "Skipping long-context test (log exists): $OUT_LC" fi done done # ── 5. Parse results into summary JSON ────────────────── log_info "Parsing results..." SUMMARY="$RESULT_DIR/summary.json" python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF' import sys, os, re, json from pathlib import Path result_dir = Path(sys.argv[1]) results = [] for logfile in sorted(result_dir.glob("*.log")): content = logfile.read_text() if "FAILED" in content: continue # Parse the pipe-delimited llama-bench table for line in content.splitlines(): line = line.strip() if not line.startswith("|") or "model" in line.lower() and "size" in line.lower(): continue if "---" in line: continue parts = [p.strip() for p in line.split("|")] if len(parts) < 10: continue # Columns: | model | size | params | backend | ngl | fa | mmap | test | t/s | try: test_type = parts[8].strip() if len(parts) > 8 else "" ts_raw = parts[9].strip() if len(parts) > 9 else "" if not test_type or not ts_raw: continue # Parse "548.18 +/- 1.59" or just "548.18" ts_match = re.match(r'([\d.]+)', ts_raw) if not ts_match: continue results.append({ "file": logfile.name, "model": parts[1].strip(), "size": parts[2].strip(), "backend": parts[4].strip(), "test": test_type, "tokens_per_sec": float(ts_match.group(1)), "raw": ts_raw, }) except (ValueError, IndexError): continue print(json.dumps({"results": results}, indent=2)) PYEOF # ── 6. Display summary ────────────────────────────────── log_header "Baseline Results" python3 - "$SUMMARY" << 'PYEOF' import sys, json with open(sys.argv[1]) as f: data = json.load(f) if not data["results"]: print(" No results parsed. Check log files for errors.") sys.exit(0) # Print table fmt = " {:<20} {:<16} {:<8} {:>10}" print(fmt.format("Model", "Backend", "Test", "t/s")) print(" " + "-" * 58) for r in data["results"]: print(fmt.format( r["model"][:20], r["backend"][:16], r["test"], f"{r['tokens_per_sec']:.2f}" )) PYEOF echo "" log_success "Baseline saved to: $RESULT_DIR" log_info "Files: system-state.json, summary.json, metrics.csv, *.log" log_info "Compare later with: bin/benchmark compare $RESULT_DIR "