feat: add benchmark filtering (--max-size, --category, --skip-longctx)

Both run-baseline.sh and run-suite.sh now support:
- --max-size GB: skip models larger than N GB (prevents OOM)
- --category LIST: filter by catalog category (smoke,dense,moe)
- --skip-longctx: skip 32K context tests (saves time + memory)
- --reps N: configure repetition count
- --help: show usage with examples

Safe pre-optimization run:
  benchmark baseline --max-size 20 --skip-longctx
Full post-optimization:
  benchmark baseline (no filters, all models + longctx)

Also: 4 new BATS tests for flag parsing (98 total, all passing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 19:07:24 +01:00
parent eb52ea52ce
commit cb25fa3f6f
4 changed files with 209 additions and 34 deletions
--- a/bin/benchmark
+++ b/bin/benchmark
@@ -11,10 +11,22 @@ case "${1:-help}" in
    compare)  exec bash "$SCRIPT_DIR/scripts/benchmark/compare.sh" "${@:2}" ;;
    *)
        echo "Usage: benchmark <command> [options]"
+        echo ""
+        echo "Commands:"
        echo "  setup     Ensure toolboxes and test models are ready"
        echo "  baseline  Capture pre-optimization baseline"
-        echo "  run       Run full benchmark suite (--tag NAME, --backends LIST)"
+        echo "  run       Run full benchmark suite"
        echo "  compare   Compare two runs (DIR1 DIR2)"
+        echo ""
+        echo "Filtering options (baseline and run):"
+        echo "  --max-size GB       Only models up to this file size"
+        echo "  --category LIST     Comma-separated: smoke,dense,moe"
+        echo "  --skip-longctx      Skip long-context (32K) tests"
+        echo "  --reps N            Standard test repetitions (default: 5)"
+        echo ""
+        echo "Examples:"
+        echo "  benchmark baseline --max-size 20 --skip-longctx"
+        echo "  benchmark run --tag post-opt --category moe"
        exit 1
        ;;
 esac