#!/usr/bin/env bash
# Full benchmark suite: run all backends × models with tagging
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

MODEL_DIR="$(data_dir models)"
TAG="run"
BACKENDS_FILTER=""
MODELS_FILTER=""

while [[ $# -gt 0 ]]; do
    case "$1" in
        --tag|-t)      TAG="$2"; shift 2 ;;
        --backends|-b) BACKENDS_FILTER="$2"; shift 2 ;;
        --models|-m)   MODELS_FILTER="$2"; shift 2 ;;
        *)             log_warn "Unknown argument: $1"; shift ;;
    esac
done

# Succeeds when any comma-separated filter entry is a case-insensitive
# substring of the candidate name.
matches_filter() {
    local name="${1,,}" entry entries
    IFS=',' read -ra entries <<< "${2,,}"
    for entry in "${entries[@]}"; do
        [[ -n "$entry" && "$name" == *"$entry"* ]] && return 0
    done
    return 1
}

TS="$(timestamp)"
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
mkdir -p "$RESULT_DIR"

REPS_STANDARD=5
REPS_LONGCTX=3

log_header "Benchmark Suite: $TAG"
log_info "Results: $RESULT_DIR"

# Save system state (best effort; a failed report should not abort the run)
bash "$SCRIPT_DIR/../audit/system-report.sh" --json \
    > "$RESULT_DIR/system-state.json" 2>/dev/null || true

# Discover backends: toolbox containers with a known llama-bench path
existing="$(detect_toolbox_names 2>/dev/null || true)"
declare -A BENCH_PATHS=(
    [llama-vulkan-radv]="/usr/sbin/llama-bench"
    [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
    [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
    [llama-rocm-7.2]="/usr/local/bin/llama-bench"
    [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)

available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
    # -Fx: exact whole-line match, so dots in container names are not
    # treated as regex wildcards
    if grep -Fxq "$tb" <<< "$existing"; then
        if [[ -z "$BACKENDS_FILTER" ]] || matches_filter "$tb" "$BACKENDS_FILTER"; then
            available_backends+=("$tb")
        fi
    fi
done

if (( ${#available_backends[@]} == 0 )); then
    log_error "No matching backends. Run: make benchmark-setup"
    exit 1
fi
log_info "Backends: ${available_backends[*]}"

# Find models: every .gguf except multimodal projectors, keeping only the
# first shard of split models
mapfile -t MODEL_PATHS < <(
    find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
        \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
        | sort
)

if [[ -n "$MODELS_FILTER" ]]; then
    filtered=()
    for p in "${MODEL_PATHS[@]}"; do
        if matches_filter "$(basename "$p")" "$MODELS_FILTER"; then
            filtered+=("$p")
        fi
    done
    MODEL_PATHS=("${filtered[@]}")
fi

if (( ${#MODEL_PATHS[@]} == 0 )); then
    log_error "No models found. Run: make benchmark-setup"
    exit 1
fi
log_info "Models: ${#MODEL_PATHS[@]}"

# Start metric logging in the background for the duration of the run
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!
trap 'kill "$METRICS_PID" 2>/dev/null || true; wait "$METRICS_PID" 2>/dev/null || true' EXIT

# Run benchmarks (same logic as run-baseline.sh)
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
    MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
    for BACKEND in "${available_backends[@]}"; do
        BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
        BACKEND_SAFE="${BACKEND//[.-]/_}"   # dots/dashes are awkward in filenames
        ENV_ARGS=()   # empty-array expansion below needs bash >= 4.4 under set -u
        [[ "$BACKEND" == *rocm* ]] && ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)

        # Standard test (skipped when a non-empty log already exists, so an
        # interrupted suite can be resumed)
        OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log"
        if [[ ! -s "$OUT" ]]; then
            printf "\n${BOLD}>> [%s] %s - standard${RESET}\n" "$BACKEND" "$MODEL_NAME"
            CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
                 -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD")
            if "${CMD[@]}" > "$OUT" 2>&1; then
                log_success "Done"; tail -3 "$OUT"
            else
                log_error "Failed"; echo "FAILED" >> "$OUT"
            fi
        fi
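        # Long-context test: llama-bench pre-fills the KV cache to a depth of
        # 32768 tokens (-d) before timing a 2048-token prompt (-p) and a short
        # 32-token generation (-n). Vulkan backends get a smaller ubatch
        # (-ub 512 vs 2048), presumably to stay within per-dispatch limits on
        # those drivers; tune UB_SIZE if your hardware behaves differently.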
-s "$OUT_LC" ]]; then printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME" UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512 CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -p 2048 -n 32 -d 32768 -ub "$UB_SIZE" -r "$REPS_LONGCTX") if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then log_success "Done"; tail -3 "$OUT_LC" else log_error "Failed"; echo "FAILED" >> "$OUT_LC" fi fi done done # Parse results SUMMARY="$RESULT_DIR/summary.json" # Parse llama-bench log files into summary JSON python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF' import sys, os, re, json from pathlib import Path result_dir = Path(sys.argv[1]) results = [] for logfile in sorted(result_dir.glob("*.log")): content = logfile.read_text() if "FAILED" in content: continue for line in content.splitlines(): line = line.strip() if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()): continue if "---" in line: continue parts = [p.strip() for p in line.split("|")] if len(parts) < 10: continue try: test_type = parts[8].strip() ts_raw = parts[9].strip() ts_match = re.match(r'([\d.]+)', ts_raw) if not ts_match: continue results.append({ "file": logfile.name, "model": parts[1].strip(), "size": parts[2].strip(), "backend": parts[4].strip(), "test": test_type, "tokens_per_sec": float(ts_match.group(1)), "raw": ts_raw, }) except (ValueError, IndexError): continue print(json.dumps({"results": results}, indent=2)) PYEOF log_header "Results" python3 - "$SUMMARY" << 'PYEOF' import sys, json with open(sys.argv[1]) as f: data = json.load(f) if not data["results"]: print(" No results parsed.") sys.exit(0) fmt = " {:<20} {:<16} {:<8} {:>10}" print(fmt.format("Model", "Backend", "Test", "t/s")) print(" " + "-" * 58) for r in data["results"]: print(fmt.format(r["model"][:20], r["backend"][:16], r["test"], f"{r['tokens_per_sec']:.2f}")) PYEOF echo "" log_success "Results saved to: $RESULT_DIR"