feat: add benchmark filtering (--max-size, --category, --skip-longctx)
Both run-baseline.sh and run-suite.sh now support:
- --max-size GB: skip models larger than N GB (prevents OOM)
- --category LIST: filter by catalog category (smoke,dense,moe)
- --skip-longctx: skip 32K context tests (saves time + memory)
- --reps N: configure repetition count
- --help: shows usage with examples

Safe pre-optimization run: benchmark baseline --max-size 20 --skip-longctx
Full post-optimization run: benchmark baseline (no filters, all models + longctx)

Also: 4 new BATS tests for flag parsing (98 total, all passing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
37
tests/benchmark_flags.bats
Normal file
37
tests/benchmark_flags.bats
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bats
|
||||
# Tests for benchmark script filtering flags
|
||||
|
||||
load test_helper.sh
|
||||
|
||||
@test "baseline --help shows usage and exits 0" {
|
||||
run bash "$PROJECT_ROOT/scripts/benchmark/run-baseline.sh" --help
|
||||
assert_success
|
||||
assert_output --partial "Usage"
|
||||
assert_output --partial "--max-size"
|
||||
assert_output --partial "--category"
|
||||
assert_output --partial "--skip-longctx"
|
||||
}
|
||||
|
||||
@test "run-suite --help shows usage and exits 0" {
|
||||
run bash "$PROJECT_ROOT/scripts/benchmark/run-suite.sh" --help
|
||||
assert_success
|
||||
assert_output --partial "Usage"
|
||||
assert_output --partial "--max-size"
|
||||
assert_output --partial "--category"
|
||||
assert_output --partial "--skip-longctx"
|
||||
assert_output --partial "--tag"
|
||||
}
|
||||
|
||||
@test "benchmark dispatcher shows help with no args" {
|
||||
run bash "$PROJECT_ROOT/bin/benchmark"
|
||||
assert_failure
|
||||
assert_output --partial "Commands"
|
||||
assert_output --partial "--max-size"
|
||||
assert_output --partial "--skip-longctx"
|
||||
}
|
||||
|
||||
@test "benchmark dispatcher passes --help through to baseline" {
|
||||
run bash "$PROJECT_ROOT/bin/benchmark" baseline --help
|
||||
assert_success
|
||||
assert_output --partial "Usage"
|
||||
}
|
||||
Reference in New Issue
Block a user