feat: add benchmark filtering (--max-size, --category, --skip-longctx)

Both run-baseline.sh and run-suite.sh now support:

- --max-size GB: skip models larger than N GB (prevents OOM)
- --category LIST: filter by catalog category (smoke,dense,moe)
- --skip-longctx: skip 32K context tests (saves time + memory)
- --reps N: configure repetition count
- --help: shows usage with examples

Safe pre-optimization run: benchmark baseline --max-size 20 --skip-longctx
Full post-optimization: benchmark baseline (no filters, all models + longctx)

Also: 4 new BATS tests for flag parsing (98 total, all passing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 19:07:24 +01:00
parent eb52ea52ce
commit cb25fa3f6f
4 changed files with 209 additions and 34 deletions
--- a/bin/benchmark
+++ b/bin/benchmark
@@ -11,10 +11,22 @@ case "${1:-help}" in
    compare)  exec bash "$SCRIPT_DIR/scripts/benchmark/compare.sh" "${@:2}" ;;
    *)
        echo "Usage: benchmark <command> [options]"
+        echo ""
+        echo "Commands:"
        echo "  setup     Ensure toolboxes and test models are ready"
        echo "  baseline  Capture pre-optimization baseline"
-        echo "  run       Run full benchmark suite (--tag NAME, --backends LIST)"
+        echo "  run       Run full benchmark suite"
        echo "  compare   Compare two runs (DIR1 DIR2)"
+        echo ""
+        echo "Filtering options (baseline and run):"
+        echo "  --max-size GB       Only models up to this file size"
+        echo "  --category LIST     Comma-separated: smoke,dense,moe"
+        echo "  --skip-longctx      Skip long-context (32K) tests"
+        echo "  --reps N            Standard test repetitions (default: 5)"
+        echo ""
+        echo "Examples:"
+        echo "  benchmark baseline --max-size 20 --skip-longctx"
+        echo "  benchmark run --tag post-opt --category moe"
        exit 1
        ;;
 esac
--- a/scripts/benchmark/run-baseline.sh
+++ b/scripts/benchmark/run-baseline.sh
@@ -14,18 +14,47 @@ mkdir -p "$RESULT_DIR"

 REPS_STANDARD=5
 REPS_LONGCTX=3
+SKIP_LONGCTX=false
+MAX_SIZE_GB=0  # 0 = no limit
+CATEGORY_FILTER=""
+
+# Parse CLI flags. Value-taking flags check their argument first: a bare
+# "shift 2" with the value missing shifts nothing, so the loop would never end.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --skip-longctx)  SKIP_LONGCTX=true; shift ;;
+        --max-size|-s)   [[ "${2-}" =~ ^[0-9]+$ ]] || { log_error "$1 requires a whole number of GB"; exit 2; }; MAX_SIZE_GB=$(( 10#$2 )); shift 2 ;;
+        --category|-c)   CATEGORY_FILTER="${2?$1 requires a value}"; shift 2 ;;
+        --reps|-r)       [[ "${2-}" =~ ^[1-9][0-9]*$ ]] || { log_error "$1 requires a positive integer"; exit 2; }; REPS_STANDARD="$2"; shift 2 ;;
+        --help|-h)
+            echo "Usage: run-baseline.sh [OPTIONS]"
+            echo ""; echo "Options:"
+            echo "  --skip-longctx       Skip long-context (32K) tests"
+            echo "  --max-size GB        Only bench models up to this file size in GB"
+            echo "  --category LIST      Comma-separated: smoke,dense,moe (from models.conf)"
+            echo "  --reps N             Standard test repetitions (default: 5)"
+            echo ""; echo "Examples:"
+            echo "  run-baseline.sh --max-size 20               # Only models ≤20 GB"
+            echo "  run-baseline.sh --category smoke,moe         # Only smoke + MoE models"
+            echo "  run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
+            exit 0 ;;
+        *) log_warn "Unknown argument: $1"; shift ;;
+    esac
+done

 log_header "Baseline Benchmark Capture"
 log_info "Results will be saved to: $RESULT_DIR"
+if $SKIP_LONGCTX; then log_info "Long-context tests: SKIPPED"; fi  # report only the filters in effect
+if (( MAX_SIZE_GB > 0 )); then log_info "Max model size: ${MAX_SIZE_GB} GB"; fi
+if [[ -n "$CATEGORY_FILTER" ]]; then log_info "Categories: $CATEGORY_FILTER"; fi

 # ── 1. Save system state ────────────────────────────────
 log_info "Capturing system state..."
 bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null

-# ── 2. Discover available toolboxes and models ──────────
+# ── 2. Discover available toolboxes ─────────────────────
 existing="$(detect_toolbox_names 2>/dev/null || true)"

-# Map toolbox names to llama-bench commands (same pattern as upstream)
 declare -A BENCH_PATHS=(
    [llama-vulkan-radv]="/usr/sbin/llama-bench"
    [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
@@ -47,24 +76,70 @@ if (( ${#available_backends[@]} == 0 )); then
    exit 1
 fi

-# Find models
-mapfile -t MODEL_PATHS < <(
+# ── 3. Discover and filter models ───────────────────────
+mapfile -t ALL_MODEL_PATHS < <(
    find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
        \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
        | sort
 )

+# Keep only the models that pass the optional --max-size / --category filters.
+MODEL_PATHS=()
+for p in "${ALL_MODEL_PATHS[@]}"; do
+    local_name="$(basename "$p")"
+
+    # Size filter: compare bytes so models of up to exactly N GiB are kept.
+    # A failed stat counts as 0 bytes, i.e. the model is not filtered out.
+    if (( MAX_SIZE_GB > 0 )); then
+        file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
+        if (( file_size_bytes > MAX_SIZE_GB * 1073741824 )); then
+            size_tenths=$(( file_size_bytes * 10 / 1073741824 ))
+            log_info "Skipping $local_name ($(( size_tenths / 10 )).$(( size_tenths % 10 )) GB > ${MAX_SIZE_GB} GB limit)"
+            continue
+        fi
+    fi
+
+    # Category filter: one pass over models.conf (name|repo|file|size_gb|category|desc).
+    # Models that are not listed in the catalog are kept (unknowns are not filtered).
+    if [[ -n "$CATEGORY_FILTER" && -f "$PROJECT_ROOT/configs/models.conf" ]]; then
+        matched=false
+        found_in_catalog=false
+        # "|| [[ -n ... ]]" also handles a final row without a trailing newline.
+        while IFS='|' read -r name repo file size_gb category desc || [[ -n "$name" ]]; do
+            [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+            if [[ "$local_name" == "$file" ]]; then
+                found_in_catalog=true
+                # Whole-entry match; spaces in the list are ignored.
+                [[ ",${CATEGORY_FILTER// /}," == *",$category,"* ]] && matched=true
+                break
+            fi
+        done < "$PROJECT_ROOT/configs/models.conf"
+        if $found_in_catalog && ! $matched; then
+            log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
+            continue
+        fi
+    fi
+
+    MODEL_PATHS+=("$p")
+done

 if (( ${#MODEL_PATHS[@]} == 0 )); then
-    log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup"
+    if (( ${#ALL_MODEL_PATHS[@]} == 0 )); then
+        log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup"
+    else
+        log_error "No models matched filters. Adjust --max-size or --category"
+    fi
    exit 1
 fi

-log_info "Found ${#MODEL_PATHS[@]} model(s):"
+log_info "Benchmarking ${#MODEL_PATHS[@]} model(s):"
 for p in "${MODEL_PATHS[@]}"; do
-    printf "  %s (%s)\n" "$(basename "$p")" "$(du -h "$p" | cut -f1)"
+    file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
+    size_tenths=$(( file_size_bytes * 10 / 1073741824 ))  # integer math: no bc, no locale-dependent %f
+    printf "  %s (%d.%d GB)\n" "$(basename "$p")" "$(( size_tenths / 10 ))" "$(( size_tenths % 10 ))"
 done

-# ── 3. Start metric logging ─────────────────────────────
+# ── 4. Start metric logging ─────────────────────────────
 METRICS_FILE="$RESULT_DIR/metrics.csv"
 bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
 METRICS_PID=$!
@@ -76,7 +151,7 @@ cleanup() {
 }
 trap cleanup EXIT

-# ── 4. Run benchmarks ───────────────────────────────────
+# ── 5. Run benchmarks ───────────────────────────────────
 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
    MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

@@ -110,6 +185,10 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
        fi

        # Long-context test (pp2048, tg32, ctx 32768)
+        if $SKIP_LONGCTX; then
+            continue
+        fi
+
        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
        if [[ ! -s "$OUT_LC" ]]; then
            printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
@@ -136,7 +215,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
    done
 done

-# ── 5. Parse results into summary JSON ──────────────────
+# ── 6. Parse results into summary JSON ──────────────────
 log_info "Parsing results..."
 SUMMARY="$RESULT_DIR/summary.json"

@@ -152,7 +231,6 @@ for logfile in sorted(result_dir.glob("*.log")):
    if "FAILED" in content:
        continue

-    # Parse the pipe-delimited llama-bench table
    for line in content.splitlines():
        line = line.strip()
        if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
@@ -164,14 +242,12 @@ for logfile in sorted(result_dir.glob("*.log")):
        if len(parts) < 10:
            continue

-        # Columns: | model | size | params | backend | ngl | fa | mmap | test | t/s |
        try:
            test_type = parts[8].strip() if len(parts) > 8 else ""
            ts_raw = parts[9].strip() if len(parts) > 9 else ""
            if not test_type or not ts_raw:
                continue

-            # Parse "548.18 +/- 1.59" or just "548.18"
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
@@ -191,7 +267,7 @@ for logfile in sorted(result_dir.glob("*.log")):
 print(json.dumps({"results": results}, indent=2))
 PYEOF

-# ── 6. Display summary ──────────────────────────────────
+# ── 7. Display summary ──────────────────────────────────
 log_header "Baseline Results"

 python3 - "$SUMMARY" << 'PYEOF'
@@ -204,7 +280,6 @@ if not data["results"]:
    print("  No results parsed. Check log files for errors.")
    sys.exit(0)

-# Print table
 fmt = "  {:<20} {:<16} {:<8} {:>10}"
 print(fmt.format("Model", "Backend", "Test", "t/s"))
 print("  " + "-" * 58)
--- a/scripts/benchmark/run-suite.sh
+++ b/scripts/benchmark/run-suite.sh
@@ -11,12 +11,33 @@ MODEL_DIR="$(data_dir models)"
 TAG="run"
 BACKENDS_FILTER=""
 MODELS_FILTER=""
+SKIP_LONGCTX=false
+MAX_SIZE_GB=0
+CATEGORY_FILTER=""
+REPS_STANDARD=5
+REPS_LONGCTX=3

 while [[ $# -gt 0 ]]; do
    case "$1" in
        --tag|-t)        TAG="$2"; shift 2 ;;
        --backends|-b)   BACKENDS_FILTER="$2"; shift 2 ;;
        --models|-m)     MODELS_FILTER="$2"; shift 2 ;;
+        # The value-taking flags below check their argument: "shift 2" without a value shifts nothing and would loop forever.
+        --skip-longctx)  SKIP_LONGCTX=true; shift ;;
+        --max-size|-s)   [[ "${2-}" =~ ^[0-9]+$ ]] || { log_error "$1 requires a whole number of GB"; exit 2; }; MAX_SIZE_GB=$(( 10#$2 )); shift 2 ;;
+        --category|-c)   CATEGORY_FILTER="${2?$1 requires a value}"; shift 2 ;;
+        --reps|-r)       [[ "${2-}" =~ ^[1-9][0-9]*$ ]] || { log_error "$1 requires a positive integer"; exit 2; }; REPS_STANDARD="$2"; shift 2 ;;
+        --help|-h)
+            echo "Usage: run-suite.sh [OPTIONS]"
+            echo ""; echo "Options:"
+            echo "  --tag NAME           Tag this run (default: run)"
+            echo "  --backends LIST      Comma-separated backend filter"
+            echo "  --models LIST        Comma-separated model filename filter"
+            echo "  --skip-longctx       Skip long-context (32K) tests"
+            echo "  --max-size GB        Only bench models up to this file size in GB"
+            echo "  --category LIST      Comma-separated: smoke,dense,moe (from models.conf)"
+            echo "  --reps N             Standard test repetitions (default: 5)"
+            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
    esac
 done
@@ -25,9 +46,6 @@ TS="$(timestamp)"
 RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
 mkdir -p "$RESULT_DIR"

-REPS_STANDARD=5
-REPS_LONGCTX=3
-
 log_header "Benchmark Suite: $TAG"
 log_info "Results: $RESULT_DIR"

@@ -60,26 +78,56 @@ if (( ${#available_backends[@]} == 0 )); then
 fi
 log_info "Backends: ${available_backends[*]}"

-# Find models
-mapfile -t MODEL_PATHS < <(
+# Find and filter models
+mapfile -t ALL_MODEL_PATHS < <(
    find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
        \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
        | sort
 )

+MODEL_PATHS=()
+for p in "${ALL_MODEL_PATHS[@]}"; do
+    local_name="$(basename "$p")"
+
+    # Name filter: keep the model only if a --models entry contains its file name (literal, case-insensitive)
    if [[ -n "$MODELS_FILTER" ]]; then
-    filtered=()
-    for p in "${MODEL_PATHS[@]}"; do
-        name="$(basename "$p")"
-        if echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$name"; then
-            filtered+=("$p")
+        if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qiF -- "$local_name"; then
+            continue
        fi
-    done
-    MODEL_PATHS=("${filtered[@]}")
    fi

+    # Size filter: compare bytes so models of up to exactly N GiB are kept.
+    if (( MAX_SIZE_GB > 0 )); then
+        file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)  # failed stat -> 0 bytes -> model kept
+        if (( file_size_bytes > MAX_SIZE_GB * 1073741824 )); then
+            size_tenths=$(( file_size_bytes * 10 / 1073741824 ))
+            log_info "Skipping $local_name ($(( size_tenths / 10 )).$(( size_tenths % 10 )) GB > ${MAX_SIZE_GB} GB limit)"
+            continue
+        fi
+    fi
+
+    # Category filter: whole-entry match on the list; models absent from models.conf are kept.
+    if [[ -n "$CATEGORY_FILTER" ]] && [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
+        matched=false found_in_catalog=false
+        while IFS='|' read -r name repo file size_gb category desc || [[ -n "$name" ]]; do
+            [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
+            if [[ "$local_name" == "$file" ]]; then
+                found_in_catalog=true
+                [[ ",${CATEGORY_FILTER// /}," == *",$category,"* ]] && matched=true
+                break
+            fi
+        done < "$PROJECT_ROOT/configs/models.conf"
+        if $found_in_catalog && ! $matched; then
+            log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
+            continue
+        fi
+    fi
+
+    MODEL_PATHS+=("$p")
+done
+
 if (( ${#MODEL_PATHS[@]} == 0 )); then
-    log_error "No models found. Run: make benchmark-setup"
+    log_error "No models to benchmark: none found or all filtered out. Run: make benchmark-setup, or relax --models/--max-size/--category"
    exit 1
 fi
 log_info "Models: ${#MODEL_PATHS[@]}"
@@ -115,6 +163,9 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
        fi

        # Long-context test
+        if $SKIP_LONGCTX; then
+            continue
+        fi
        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
        if [[ ! -s "$OUT_LC" ]]; then
            printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME"
--- a/tests/benchmark_flags.bats
+++ b/tests/benchmark_flags.bats
@@ -0,0 +1,37 @@
+#!/usr/bin/env bats
+# Tests for the benchmark filtering flags: checks the --help output of run-baseline.sh and run-suite.sh, and the bin/benchmark dispatcher's usage text and --help pass-through
+
+load test_helper.sh
+
+@test "baseline --help shows usage and exits 0" {
+    run bash "$PROJECT_ROOT/scripts/benchmark/run-baseline.sh" --help
+    assert_success
+    assert_output --partial "Usage"
+    assert_output --partial "--max-size"
+    assert_output --partial "--category"
+    assert_output --partial "--skip-longctx"
+}
+
+@test "run-suite --help shows usage and exits 0" {
+    run bash "$PROJECT_ROOT/scripts/benchmark/run-suite.sh" --help
+    assert_success
+    assert_output --partial "Usage"
+    assert_output --partial "--max-size"
+    assert_output --partial "--category"
+    assert_output --partial "--skip-longctx"
+    assert_output --partial "--tag"
+}
+
+@test "benchmark dispatcher shows help with no args" {
+    run bash "$PROJECT_ROOT/bin/benchmark"
+    assert_failure
+    assert_output --partial "Commands"
+    assert_output --partial "--max-size"
+    assert_output --partial "--skip-longctx"
+}
+
+@test "benchmark dispatcher passes --help through to baseline" {
+    run bash "$PROJECT_ROOT/bin/benchmark" baseline --help
+    assert_success
+    assert_output --partial "Usage"
+}