#!/usr/bin/env bash
#
# run-baseline.sh — Capture a pre-optimization baseline benchmark.
#
# Discovers installed toolbox backends and local GGUF models, then runs
# llama-bench standard and long-context tests for each combination,
# saving per-run logs, hardware metrics, and a parsed JSON summary under
# data/baselines/<timestamp>/.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

MODEL_DIR="$(data_dir models)"
TS="$(timestamp)"
RESULT_DIR="$(data_dir baselines)/$TS"
mkdir -p "$RESULT_DIR"

# Defaults; all overridable via the CLI flags parsed below.
REPS_STANDARD=5
REPS_LONGCTX=3
SKIP_LONGCTX=false
MAX_SIZE_GB=0        # 0 = no limit
CATEGORY_FILTER=""
CTX_DEPTH=32768
CTX_PROMPT=2048
PP_TOKENS=512
TG_TOKENS=128
BATCH_SIZE=""        # Batch size override (-b flag, empty = llama-bench default 2048)
KV_TYPES_RAW=""      # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)

while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-longctx) SKIP_LONGCTX=true; shift ;;
    --max-size|-s)  MAX_SIZE_GB="$2"; shift 2 ;;
    --category|-c)  CATEGORY_FILTER="$2"; shift 2 ;;
    --reps|-r)      REPS_STANDARD="$2"; shift 2 ;;
    --context|-d)   CTX_DEPTH="$2"; shift 2 ;;
    --pp)           PP_TOKENS="$2"; shift 2 ;;
    --tg)           TG_TOKENS="$2"; shift 2 ;;
    -b|--batch)     BATCH_SIZE="$2"; shift 2 ;;
    --kv-types)     KV_TYPES_RAW="$2"; shift 2 ;;
    --help|-h)
      echo "Usage: run-baseline.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  --skip-longctx   Skip long-context tests"
      echo "  --max-size GB    Only bench models up to this file size in GB"
      echo "  --category LIST  Comma-separated: smoke,dense,moe (from models.conf)"
      echo "  --reps N         Standard test repetitions (default: 5)"
      echo "  --context N      Long-context depth in tokens (default: 32768)"
      echo "  --pp N           Prompt processing tokens (default: 512)"
      echo "  --tg N           Token generation count (default: 128)"
      echo "  -b, --batch N    Batch size (default: 2048, try 256 for MoE)"
      echo "  --kv-types LIST  KV cache sweep: comma-separated types to test"
      echo "                   Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
      echo "                   Types: f16, q8_0, q4_0, q4_1"
      echo ""
      echo "Examples:"
      echo "  run-baseline.sh --max-size 20                          # Only models ≤20 GB"
      echo "  run-baseline.sh --context 131072 --category moe        # 128K context on MoE"
      echo "  run-baseline.sh --tg 1024 --pp 2048 --category moe     # Realistic agentic"
      echo "  run-baseline.sh --kv-types f16,q8_0,q4_0 --context 131072  # KV sweep"
      echo "  run-baseline.sh --skip-longctx --max-size 15           # Quick safe run"
      exit 0
      ;;
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done

# Scale prompt tokens for large contexts: ~1/16 of depth, min 512
if (( CTX_DEPTH > 32768 )); then
  CTX_PROMPT=$(( CTX_DEPTH / 16 ))
  (( CTX_PROMPT < 512 )) && CTX_PROMPT=512
fi

# Parse KV cache types for sweep; default is a single f16 (no-op) entry.
if [[ -n "$KV_TYPES_RAW" ]]; then
  IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
else
  KV_TYPES=("f16")
fi

log_header "Baseline Benchmark Capture"
log_info "Results will be saved to: $RESULT_DIR"
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"

# ── 1. Save system state ────────────────────────────────
log_info "Capturing system state..."
# Best-effort: stderr is already discarded, so a failing audit script
# must not abort the benchmark run under `set -e`.
bash "$SCRIPT_DIR/../audit/system-report.sh" --json \
  > "$RESULT_DIR/system-state.json" 2>/dev/null \
  || log_warn "system-report.sh failed; system-state.json may be incomplete"

# ── 2. Discover available toolboxes ─────────────────────
existing="$(detect_toolbox_names 2>/dev/null || true)"
declare -A BENCH_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-bench"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2]="/usr/local/bin/llama-bench"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)
available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
  # Fixed-string whole-line match: names like "llama-rocm-6.4.4" contain
  # dots, which the previous regex match treated as wildcards.
  if echo "$existing" | grep -qxF -- "$tb"; then
    available_backends+=("$tb")
    log_success "Backend: $tb"
  fi
done

if (( ${#available_backends[@]} == 0 )); then
  log_error "No toolbox backends found. Run: make benchmark-setup"
  exit 1
fi
# ── 3. Discover and filter models ───────────────────────
# Collect *.gguf files, excluding mmproj projector files; for split
# models keep only the first shard (*-00001-of-*).
mapfile -t ALL_MODEL_PATHS < <(
  find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)

MODEL_PATHS=()
for p in "${ALL_MODEL_PATHS[@]}"; do
  # Size filter.  Compare raw bytes so "--max-size N" really means
  # "≤ N GB": the old floor-to-GB + `>=` comparison excluded an exactly
  # N.0 GB file while claiming ">" in the message.
  if (( MAX_SIZE_GB > 0 )); then
    file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
    if (( file_size_bytes > MAX_SIZE_GB * 1073741824 )); then
      log_info "Skipping $(basename "$p") ($(( file_size_bytes / 1073741824 )) GB > ${MAX_SIZE_GB} GB limit)"
      continue
    fi
  fi

  # Category filter (match against models.conf if available).  One pass
  # over the catalog determines both whether the model is listed and
  # whether its category matches; models absent from the catalog are
  # always included (don't filter unknowns).
  if [[ -n "$CATEGORY_FILTER" ]]; then
    local_name="$(basename "$p")"
    matched=false
    found_in_catalog=false
    if [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
      while IFS='|' read -r name repo file size_gb category desc; do
        [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
        if [[ "$local_name" == "$file" ]]; then
          found_in_catalog=true
          if echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category"; then
            matched=true
          fi
          break
        fi
      done < "$PROJECT_ROOT/configs/models.conf"
    fi
    if $found_in_catalog && ! $matched; then
      log_info "Skipping $(basename "$p") (category not in: $CATEGORY_FILTER)"
      continue
    fi
  fi

  MODEL_PATHS+=("$p")
done

if (( ${#MODEL_PATHS[@]} == 0 )); then
  log_error "No models matched filters. Adjust --max-size or --category"
  exit 1
fi

log_info "Benchmarking ${#MODEL_PATHS[@]} model(s):"
for p in "${MODEL_PATHS[@]}"; do
  file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
  printf " %s (%.1f GB)\n" "$(basename "$p")" \
    "$(echo "scale=1; $file_size_bytes / 1073741824" | bc)"
done
# ── 4. Start metric logging ─────────────────────────────
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!
log_info "Metric logger started (PID: $METRICS_PID)"

# Stop the metric logger on every exit path.  NOTE: the trap must NOT
# call `exit 0` — doing so overwrote the script's real exit status, so
# failures under `set -e` were reported as success to callers.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
trap cleanup EXIT

# ── 5. Run benchmarks ───────────────────────────────────
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

  for BACKEND in "${available_backends[@]}"; do
    BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
    BACKEND_SAFE="${BACKEND//[.-]/_}"

    # Build environment args for ROCm backends
    ENV_ARGS=()
    if [[ "$BACKEND" == *rocm* ]]; then
      ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
    fi

    # Resolve model path for toolbox (host paths need /run/host/ prefix)
    TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
    if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then
      TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
    fi

    for KV_SPEC in "${KV_TYPES[@]}"; do
      # Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
      if [[ "$KV_SPEC" == *:* ]]; then
        KV_K="${KV_SPEC%%:*}"
        KV_V="${KV_SPEC##*:}"
      else
        KV_K="$KV_SPEC"
        KV_V="$KV_SPEC"
      fi

      # Build KV cache args (skip for f16 — it's the default)
      KV_ARGS=()
      KV_SUFFIX=""
      if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
        KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
        KV_SUFFIX="__kv_${KV_K}-${KV_V}"
      fi

      # Build batch size args
      BATCH_ARGS=()
      BATCH_SUFFIX=""
      if [[ -n "$BATCH_SIZE" ]]; then
        BATCH_ARGS+=(-b "$BATCH_SIZE")
        BATCH_SUFFIX="__b${BATCH_SIZE}"
      fi

      # Standard test.  Non-default pp/tg values are encoded in the log
      # filename so reruns with different sizes don't collide.
      local_suffix="fa1"
      [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
      OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log"
      if [[ ! -s "$OUT" ]]; then
        printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
          "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
        CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
          -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
          -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD"
          "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
        printf " cmd: %s\n" "${CMD[*]}"
        if "${CMD[@]}" > "$OUT" 2>&1; then
          log_success "Standard test complete"
          tail -5 "$OUT"
        else
          log_error "Standard test failed (exit $?)"
          echo "FAILED" >> "$OUT"
        fi
      else
        log_info "Skipping standard test (log exists): $OUT"
      fi

      # Long-context test
      if $SKIP_LONGCTX; then
        continue
      fi
      OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log"
      if [[ ! -s "$OUT_LC" ]]; then
        printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \
          "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
        # Vulkan backends get a smaller micro-batch at depth.
        UB_SIZE=2048
        [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
        # BUGFIX: pass BATCH_ARGS here too — the log filename already
        # carries ${BATCH_SUFFIX}, but the command previously omitted -b,
        # so the recorded name misrepresented the actual run.
        CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
          -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
          -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
          -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
        printf " cmd: %s\n" "${CMD_LC[*]}"
        if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
          log_success "Long-context test complete"
          tail -5 "$OUT_LC"
        else
          log_error "Long-context test failed (exit $?)"
          echo "FAILED" >> "$OUT_LC"
        fi
      else
        log_info "Skipping long-context test (log exists): $OUT_LC"
      fi
    done # KV_TYPES
  done
done

# ── 6. Parse results into summary JSON ──────────────────
log_info "Parsing results..."
SUMMARY="$RESULT_DIR/summary.json"
# Parse the llama-bench markdown tables out of every *.log file into a
# single summary.json ({"results": [...]}) for later comparison.
python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF'
import sys, re, json
from pathlib import Path

result_dir = Path(sys.argv[1])
results = []

for logfile in sorted(result_dir.glob("*.log")):
    content = logfile.read_text()
    if "FAILED" in content:
        continue

    # Extract KV cache type from the filename, e.g. "__kv_q4_0-q8_0.log".
    # An optional batch suffix may follow the KV suffix
    # ("__kv_q4_0-q8_0__b256.log"); the old greedy pattern swallowed it
    # into the V-type, yielding values like "q8_0/q8_0__b256".
    kv_match = re.search(r'__kv_([^-]+)-(.+?)(?:__b\d+)?\.log$', logfile.name)
    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"

    for line in content.splitlines():
        line = line.strip()
        # Keep only table body rows: skip non-pipe lines, the header row,
        # and separator rows.
        if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
            continue
        if "---" in line:
            continue
        parts = [p.strip() for p in line.split("|")]
        # Filter out empty parts from leading/trailing pipes
        data = [p for p in parts if p and "---" not in p]
        if len(data) < 6:
            continue
        try:
            # test and t/s are always the last two columns
            test_type = data[-2]
            ts_raw = data[-1]
            if not test_type or not ts_raw:
                continue
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
            results.append({
                "file": logfile.name,
                "model": data[0],
                "size": data[1],
                "backend": data[3],  # assumes llama-bench column order model|size|params|backend — confirm
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "kv_cache": kv_type,
                "raw": ts_raw,
            })
        except (ValueError, IndexError):
            continue

print(json.dumps({"results": results}, indent=2))
PYEOF

# ── 7. Display summary ──────────────────────────────────
log_header "Baseline Results"
python3 - "$SUMMARY" << 'PYEOF'
import sys, json

with open(sys.argv[1]) as f:
    data = json.load(f)

if not data["results"]:
    print(" No results parsed. Check log files for errors.")
    sys.exit(0)

fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
print(" " + "-" * 68)
for r in data["results"]:
    print(fmt.format(
        r["model"][:20],
        r["backend"][:16],
        r.get("kv_cache", "f16/f16")[:10],
        r["test"],
        f"{r['tokens_per_sec']:.2f}",
    ))
PYEOF

echo ""
log_success "Baseline saved to: $RESULT_DIR"
log_info "Files: system-state.json, summary.json, metrics.csv, *.log"
log_info "Compare later with: bin/benchmark compare $RESULT_DIR"