diff --git a/bin/benchmark b/bin/benchmark
index ce0e400..4bdc124 100755
--- a/bin/benchmark
+++ b/bin/benchmark
@@ -11,10 +11,22 @@ case "${1:-help}" in
   compare)   exec bash "$SCRIPT_DIR/scripts/benchmark/compare.sh" "${@:2}" ;;
   *)
     echo "Usage: benchmark [options]"
+    echo ""
+    echo "Commands:"
     echo "  setup     Ensure toolboxes and test models are ready"
     echo "  baseline  Capture pre-optimization baseline"
-    echo "  run       Run full benchmark suite (--tag NAME, --backends LIST)"
+    echo "  run       Run full benchmark suite"
     echo "  compare   Compare two runs (DIR1 DIR2)"
+    echo ""
+    echo "Filtering options (baseline and run):"
+    echo "  --max-size GB    Only models up to this file size"
+    echo "  --category LIST  Comma-separated: smoke,dense,moe"
+    echo "  --skip-longctx   Skip long-context (32K) tests"
+    echo "  --reps N         Standard test repetitions (default: 5)"
+    echo ""
+    echo "Examples:"
+    echo "  benchmark baseline --max-size 20 --skip-longctx"
+    echo "  benchmark run --tag post-opt --category moe"
     exit 1
     ;;
 esac
diff --git a/scripts/benchmark/run-baseline.sh b/scripts/benchmark/run-baseline.sh
index 928e0e7..b93c057 100644
--- a/scripts/benchmark/run-baseline.sh
+++ b/scripts/benchmark/run-baseline.sh
@@ -14,18 +14,47 @@ mkdir -p "$RESULT_DIR"
 REPS_STANDARD=5
 REPS_LONGCTX=3
 
+SKIP_LONGCTX=false
+MAX_SIZE_GB=0   # 0 = no limit
+CATEGORY_FILTER=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --skip-longctx) SKIP_LONGCTX=true; shift ;;
+    --max-size|-s)  MAX_SIZE_GB="$2"; shift 2 ;;
+    --category|-c)  CATEGORY_FILTER="$2"; shift 2 ;;
+    --reps|-r)      REPS_STANDARD="$2"; shift 2 ;;
+    --help|-h)
+      echo "Usage: run-baseline.sh [OPTIONS]"
+      echo ""
+      echo "Options:"
+      echo "  --skip-longctx   Skip long-context (32K) tests"
+      echo "  --max-size GB    Only bench models up to this file size in GB"
+      echo "  --category LIST  Comma-separated: smoke,dense,moe (from models.conf)"
+      echo "  --reps N         Standard test repetitions (default: 5)"
+      echo ""
+      echo "Examples:"
+      echo "  run-baseline.sh --max-size 20                 # Only models ≤20 GB"
+      echo "  run-baseline.sh --category smoke,moe          # Only smoke + MoE models"
+      echo "  run-baseline.sh --skip-longctx --max-size 15  # Quick safe run"
+      exit 0 ;;
+    *) log_warn "Unknown argument: $1"; shift ;;
+  esac
+done
 
 log_header "Baseline Benchmark Capture"
 log_info "Results will be saved to: $RESULT_DIR"
+$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
+(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
+[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
 
 # ── 1. Save system state ────────────────────────────────
 log_info "Capturing system state..."
 bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
 
-# ── 2. Discover available toolboxes and models ──────────
+# ── 2. Discover available toolboxes ─────────────────────
 existing="$(detect_toolbox_names 2>/dev/null || true)"
 
-# Map toolbox names to llama-bench commands (same pattern as upstream)
 declare -A BENCH_PATHS=(
   [llama-vulkan-radv]="/usr/sbin/llama-bench"
   [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
@@ -47,24 +76,70 @@ if (( ${#available_backends[@]} == 0 )); then
   exit 1
 fi
 
-# Find models
-mapfile -t MODEL_PATHS < <(
+# ── 3. Discover and filter models ───────────────────────
+mapfile -t ALL_MODEL_PATHS < <(
   find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
     \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
     | sort
 )
 
+MODEL_PATHS=()
+for p in "${ALL_MODEL_PATHS[@]}"; do
+  # Size filter
+  if (( MAX_SIZE_GB > 0 )); then
+    file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
+    file_size_gb=$(( file_size_bytes / 1073741824 ))
+    if (( file_size_gb >= MAX_SIZE_GB )); then
+      log_info "Skipping $(basename "$p") (${file_size_gb} GB ≥ ${MAX_SIZE_GB} GB limit)"
+      continue
+    fi
+  fi
+
+  # Category filter (match against models.conf if available)
+  if [[ -n "$CATEGORY_FILTER" ]]; then
+    local_name="$(basename "$p")"
+    matched=false
+    if [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
+      while IFS='|' read -r name repo file size_gb category desc; do
+        [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+        if [[ "$local_name" == "$file" ]]; then
+          if echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category"; then
+            matched=true
+          fi
+          break
+        fi
+      done < "$PROJECT_ROOT/configs/models.conf"
+    fi
+    # If model not in catalog, include it (don't filter unknowns)
+    if ! $matched; then
+      # Check if we found it in catalog at all
+      found_in_catalog=false
+      while IFS='|' read -r name repo file size_gb category desc; do
+        [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+        [[ "$local_name" == "$file" ]] && found_in_catalog=true && break
+      done < "$PROJECT_ROOT/configs/models.conf"
+      if $found_in_catalog; then
+        log_info "Skipping $(basename "$p") (category not in: $CATEGORY_FILTER)"
+        continue
+      fi
+    fi
+  fi
+
+  MODEL_PATHS+=("$p")
+done
+
 if (( ${#MODEL_PATHS[@]} == 0 )); then
-  log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup"
+  log_error "No models matched filters. Adjust --max-size or --category"
   exit 1
 fi
 
-log_info "Found ${#MODEL_PATHS[@]} model(s):"
+log_info "Benchmarking ${#MODEL_PATHS[@]} model(s):"
 for p in "${MODEL_PATHS[@]}"; do
-  printf "  %s (%s)\n" "$(basename "$p")" "$(du -h "$p" | cut -f1)"
+  file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
+  printf "  %s (%.1f GB)\n" "$(basename "$p")" "$(echo "scale=1; $file_size_bytes / 1073741824" | bc)"
 done
 
-# ── 3. Start metric logging ─────────────────────────────
+# ── 4. Start metric logging ─────────────────────────────
 METRICS_FILE="$RESULT_DIR/metrics.csv"
 bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
 METRICS_PID=$!
@@ -76,7 +151,7 @@ cleanup() {
 }
 trap cleanup EXIT
 
-# ── 4. Run benchmarks ───────────────────────────────────
+# ── 5. Run benchmarks ───────────────────────────────────
 
 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
   MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
@@ -110,6 +185,10 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
   fi
 
   # Long-context test (pp2048, tg32, ctx 32768)
+  if $SKIP_LONGCTX; then
+    continue
+  fi
+
   OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
   if [[ ! -s "$OUT_LC" ]]; then
     printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
@@ -136,7 +215,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
   done
 done
 
-# ── 5. Parse results into summary JSON ──────────────────
+# ── 6. Parse results into summary JSON ──────────────────
 log_info "Parsing results..."
 
 SUMMARY="$RESULT_DIR/summary.json"
@@ -152,7 +231,6 @@ for logfile in sorted(result_dir.glob("*.log")):
     if "FAILED" in content:
         continue
 
-    # Parse the pipe-delimited llama-bench table
     for line in content.splitlines():
         line = line.strip()
         if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
@@ -164,14 +242,12 @@ for logfile in sorted(result_dir.glob("*.log")):
         if len(parts) < 10:
             continue
 
-        # Columns: | model | size | params | backend | ngl | fa | mmap | test | t/s |
         try:
             test_type = parts[8].strip() if len(parts) > 8 else ""
             ts_raw = parts[9].strip() if len(parts) > 9 else ""
             if not test_type or not ts_raw:
                 continue
 
-            # Parse "548.18 +/- 1.59" or just "548.18"
             ts_match = re.match(r'([\d.]+)', ts_raw)
             if not ts_match:
                 continue
@@ -191,7 +267,7 @@ for logfile in sorted(result_dir.glob("*.log")):
 print(json.dumps({"results": results}, indent=2))
 PYEOF
 
-# ── 6. Display summary ──────────────────────────────────
+# ── 7. Display summary ──────────────────────────────────
 log_header "Baseline Results"
 
 python3 - "$SUMMARY" << 'PYEOF'
@@ -204,7 +280,6 @@ if not data["results"]:
     print("  No results parsed. Check log files for errors.")
     sys.exit(0)
 
-# Print table
 fmt = "  {:<20} {:<16} {:<8} {:>10}"
 print(fmt.format("Model", "Backend", "Test", "t/s"))
 print("  " + "-" * 58)
diff --git a/scripts/benchmark/run-suite.sh b/scripts/benchmark/run-suite.sh
index e18f213..5433b86 100644
--- a/scripts/benchmark/run-suite.sh
+++ b/scripts/benchmark/run-suite.sh
@@ -11,12 +11,33 @@ MODEL_DIR="$(data_dir models)"
 TAG="run"
 BACKENDS_FILTER=""
 MODELS_FILTER=""
+SKIP_LONGCTX=false
+MAX_SIZE_GB=0
+CATEGORY_FILTER=""
+REPS_STANDARD=5
+REPS_LONGCTX=3
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
-    --tag|-t) TAG="$2"; shift 2 ;;
-    --backends|-b) BACKENDS_FILTER="$2"; shift 2 ;;
-    --models|-m) MODELS_FILTER="$2"; shift 2 ;;
+    --tag|-t)       TAG="$2"; shift 2 ;;
+    --backends|-b)  BACKENDS_FILTER="$2"; shift 2 ;;
+    --models|-m)    MODELS_FILTER="$2"; shift 2 ;;
+    --skip-longctx) SKIP_LONGCTX=true; shift ;;
+    --max-size|-s)  MAX_SIZE_GB="$2"; shift 2 ;;
+    --category|-c)  CATEGORY_FILTER="$2"; shift 2 ;;
+    --reps|-r)      REPS_STANDARD="$2"; shift 2 ;;
+    --help|-h)
+      echo "Usage: run-suite.sh [OPTIONS]"
+      echo ""
+      echo "Options:"
+      echo "  --tag NAME       Tag this run (default: run)"
+      echo "  --backends LIST  Comma-separated backend filter"
+      echo "  --models LIST    Comma-separated model filename filter"
+      echo "  --skip-longctx   Skip long-context (32K) tests"
+      echo "  --max-size GB    Only bench models up to this file size in GB"
+      echo "  --category LIST  Comma-separated: smoke,dense,moe (from models.conf)"
+      echo "  --reps N         Standard test repetitions (default: 5)"
+      exit 0 ;;
     *) log_warn "Unknown argument: $1"; shift ;;
   esac
 done
@@ -25,9 +46,6 @@ TS="$(timestamp)"
 RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
 mkdir -p "$RESULT_DIR"
 
-REPS_STANDARD=5
-REPS_LONGCTX=3
-
 log_header "Benchmark Suite: $TAG"
 log_info "Results: $RESULT_DIR"
 
@@ -60,26 +78,56 @@ if (( ${#available_backends[@]} == 0 )); then
 fi
 log_info "Backends: ${available_backends[*]}"
 
-# Find models
-mapfile -t MODEL_PATHS < <(
+# Find and filter models
+mapfile -t ALL_MODEL_PATHS < <(
   find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
     \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
     | sort
 )
 
-if [[ -n "$MODELS_FILTER" ]]; then
-  filtered=()
-  for p in "${MODEL_PATHS[@]}"; do
-    name="$(basename "$p")"
-    if echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$name"; then
-      filtered+=("$p")
+MODEL_PATHS=()
+for p in "${ALL_MODEL_PATHS[@]}"; do
+  local_name="$(basename "$p")"
+
+  # Name filter
+  if [[ -n "$MODELS_FILTER" ]]; then
+    if ! printf '%s\n' "$local_name" | grep -qiF -f <(echo "$MODELS_FILTER" | tr ',' '\n'); then
+      continue
     fi
-  done
-  MODEL_PATHS=("${filtered[@]}")
-fi
+  fi
+
+  # Size filter
+  if (( MAX_SIZE_GB > 0 )); then
+    file_size_gb=$(( $(stat -Lc%s "$p" 2>/dev/null || echo 0) / 1073741824 ))
+    if (( file_size_gb >= MAX_SIZE_GB )); then
+      log_info "Skipping $local_name (${file_size_gb} GB ≥ ${MAX_SIZE_GB} GB limit)"
+      continue
+    fi
+  fi
+
+  # Category filter
+  if [[ -n "$CATEGORY_FILTER" ]] && [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
+    matched=false
+    found_in_catalog=false
+    while IFS='|' read -r name repo file size_gb category desc; do
+      [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+      if [[ "$local_name" == "$file" ]]; then
+        found_in_catalog=true
+        echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category" && matched=true
+        break
+      fi
+    done < "$PROJECT_ROOT/configs/models.conf"
+    if $found_in_catalog && ! $matched; then
+      log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
+      continue
+    fi
+  fi
+
+  MODEL_PATHS+=("$p")
+done
 
 if (( ${#MODEL_PATHS[@]} == 0 )); then
-  log_error "No models found. Run: make benchmark-setup"
+  log_error "No models matched filters. Run: make benchmark-setup"
   exit 1
 fi
 log_info "Models: ${#MODEL_PATHS[@]}"
@@ -115,6 +163,9 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
   fi
 
   # Long-context test
+  if $SKIP_LONGCTX; then
+    continue
+  fi
   OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
   if [[ ! -s "$OUT_LC" ]]; then
     printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME"
diff --git a/tests/benchmark_flags.bats b/tests/benchmark_flags.bats
new file mode 100644
index 0000000..923a363
--- /dev/null
+++ b/tests/benchmark_flags.bats
@@ -0,0 +1,37 @@
+#!/usr/bin/env bats
+# Tests for benchmark script filtering flags
+
+load test_helper.sh
+
+@test "baseline --help shows usage and exits 0" {
+  run bash "$PROJECT_ROOT/scripts/benchmark/run-baseline.sh" --help
+  assert_success
+  assert_output --partial "Usage"
+  assert_output --partial "--max-size"
+  assert_output --partial "--category"
+  assert_output --partial "--skip-longctx"
+}
+
+@test "run-suite --help shows usage and exits 0" {
+  run bash "$PROJECT_ROOT/scripts/benchmark/run-suite.sh" --help
+  assert_success
+  assert_output --partial "Usage"
+  assert_output --partial "--max-size"
+  assert_output --partial "--category"
+  assert_output --partial "--skip-longctx"
+  assert_output --partial "--tag"
+}
+
+@test "benchmark dispatcher shows help with no args" {
+  run bash "$PROJECT_ROOT/bin/benchmark"
+  assert_failure
+  assert_output --partial "Commands"
+  assert_output --partial "--max-size"
+  assert_output --partial "--skip-longctx"
+}
+
+@test "benchmark dispatcher passes --help through to baseline" {
+  run bash "$PROJECT_ROOT/bin/benchmark" baseline --help
+  assert_success
+  assert_output --partial "Usage"
+}