feat: add benchmark filtering (--max-size, --category, --skip-longctx)
Both run-baseline.sh and run-suite.sh now support:
- --max-size GB: skip models larger than N GB (prevents OOM)
- --category LIST: filter by catalog category (smoke,dense,moe)
- --skip-longctx: skip 32K context tests (saves time + memory)
- --reps N: configure repetition count
- --help: shows usage with examples

Safe pre-optimization run: benchmark baseline --max-size 20 --skip-longctx
Full post-optimization: benchmark baseline (no filters, all models + longctx)

Also: 4 new BATS tests for flag parsing (98 total, all passing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -11,10 +11,22 @@ case "${1:-help}" in
|
|||||||
compare) exec bash "$SCRIPT_DIR/scripts/benchmark/compare.sh" "${@:2}" ;;
|
compare) exec bash "$SCRIPT_DIR/scripts/benchmark/compare.sh" "${@:2}" ;;
|
||||||
*)
|
*)
|
||||||
echo "Usage: benchmark <command> [options]"
|
echo "Usage: benchmark <command> [options]"
|
||||||
|
echo ""
|
||||||
|
echo "Commands:"
|
||||||
echo " setup Ensure toolboxes and test models are ready"
|
echo " setup Ensure toolboxes and test models are ready"
|
||||||
echo " baseline Capture pre-optimization baseline"
|
echo " baseline Capture pre-optimization baseline"
|
||||||
echo " run Run full benchmark suite (--tag NAME, --backends LIST)"
|
echo " run Run full benchmark suite"
|
||||||
echo " compare Compare two runs (DIR1 DIR2)"
|
echo " compare Compare two runs (DIR1 DIR2)"
|
||||||
|
echo ""
|
||||||
|
echo "Filtering options (baseline and run):"
|
||||||
|
echo " --max-size GB Only models up to this file size"
|
||||||
|
echo " --category LIST Comma-separated: smoke,dense,moe"
|
||||||
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
echo ""
|
||||||
|
echo "Examples:"
|
||||||
|
echo " benchmark baseline --max-size 20 --skip-longctx"
|
||||||
|
echo " benchmark run --tag post-opt --category moe"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|||||||
@@ -14,18 +14,47 @@ mkdir -p "$RESULT_DIR"
|
|||||||
|
|
||||||
REPS_STANDARD=5
|
REPS_STANDARD=5
|
||||||
REPS_LONGCTX=3
|
REPS_LONGCTX=3
|
||||||
|
SKIP_LONGCTX=false
|
||||||
|
MAX_SIZE_GB=0 # 0 = no limit
|
||||||
|
CATEGORY_FILTER=""
|
||||||
|
|
||||||
|
while [[ $# -gt 0 ]]; do
|
||||||
|
case "$1" in
|
||||||
|
--skip-longctx) SKIP_LONGCTX=true; shift ;;
|
||||||
|
--max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
|
||||||
|
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
|
||||||
|
--reps|-r) REPS_STANDARD="$2"; shift 2 ;;
|
||||||
|
--help|-h)
|
||||||
|
echo "Usage: run-baseline.sh [OPTIONS]"
|
||||||
|
echo ""
|
||||||
|
echo "Options:"
|
||||||
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
|
echo " --max-size GB Only bench models up to this file size in GB"
|
||||||
|
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
|
||||||
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
echo ""
|
||||||
|
echo "Examples:"
|
||||||
|
echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
|
||||||
|
echo " run-baseline.sh --category smoke,moe # Only smoke + MoE models"
|
||||||
|
echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
|
||||||
|
exit 0 ;;
|
||||||
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
log_header "Baseline Benchmark Capture"
|
log_header "Baseline Benchmark Capture"
|
||||||
log_info "Results will be saved to: $RESULT_DIR"
|
log_info "Results will be saved to: $RESULT_DIR"
|
||||||
|
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
|
||||||
|
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
|
||||||
|
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
|
||||||
|
|
||||||
# ── 1. Save system state ────────────────────────────────
|
# ── 1. Save system state ────────────────────────────────
|
||||||
log_info "Capturing system state..."
|
log_info "Capturing system state..."
|
||||||
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
|
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
|
||||||
|
|
||||||
# ── 2. Discover available toolboxes and models ──────────
|
# ── 2. Discover available toolboxes ─────────────────────
|
||||||
existing="$(detect_toolbox_names 2>/dev/null || true)"
|
existing="$(detect_toolbox_names 2>/dev/null || true)"
|
||||||
|
|
||||||
# Map toolbox names to llama-bench commands (same pattern as upstream)
|
|
||||||
declare -A BENCH_PATHS=(
|
declare -A BENCH_PATHS=(
|
||||||
[llama-vulkan-radv]="/usr/sbin/llama-bench"
|
[llama-vulkan-radv]="/usr/sbin/llama-bench"
|
||||||
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
[llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
|
||||||
@@ -47,24 +76,70 @@ if (( ${#available_backends[@]} == 0 )); then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Find models
|
# ── 3. Discover and filter models ───────────────────────
|
||||||
mapfile -t MODEL_PATHS < <(
|
mapfile -t ALL_MODEL_PATHS < <(
|
||||||
find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
|
find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
|
||||||
\( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
|
\( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
|
||||||
| sort
|
| sort
|
||||||
)
|
)
|
||||||
|
|
||||||
|
MODEL_PATHS=()
|
||||||
|
for p in "${ALL_MODEL_PATHS[@]}"; do
|
||||||
|
# Size filter
|
||||||
|
if (( MAX_SIZE_GB > 0 )); then
|
||||||
|
file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
|
||||||
|
file_size_gb=$(( file_size_bytes / 1073741824 ))
|
||||||
|
if (( file_size_gb >= MAX_SIZE_GB )); then
|
||||||
|
log_info "Skipping $(basename "$p") ($(( file_size_gb )) GB > ${MAX_SIZE_GB} GB limit)"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Category filter (match against models.conf if available)
|
||||||
|
if [[ -n "$CATEGORY_FILTER" ]]; then
|
||||||
|
local_name="$(basename "$p")"
|
||||||
|
matched=false
|
||||||
|
if [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
|
||||||
|
while IFS='|' read -r name repo file size_gb category desc; do
|
||||||
|
[[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
|
||||||
|
if [[ "$local_name" == "$file" ]]; then
|
||||||
|
if echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category"; then
|
||||||
|
matched=true
|
||||||
|
fi
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done < "$PROJECT_ROOT/configs/models.conf"
|
||||||
|
fi
|
||||||
|
# If model not in catalog, include it (don't filter unknowns)
|
||||||
|
if ! $matched; then
|
||||||
|
# Check if we found it in catalog at all
|
||||||
|
found_in_catalog=false
|
||||||
|
while IFS='|' read -r name repo file size_gb category desc; do
|
||||||
|
[[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
|
||||||
|
[[ "$local_name" == "$file" ]] && found_in_catalog=true && break
|
||||||
|
done < "$PROJECT_ROOT/configs/models.conf"
|
||||||
|
if $found_in_catalog; then
|
||||||
|
log_info "Skipping $(basename "$p") (category not in: $CATEGORY_FILTER)"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
MODEL_PATHS+=("$p")
|
||||||
|
done
|
||||||
|
|
||||||
if (( ${#MODEL_PATHS[@]} == 0 )); then
|
if (( ${#MODEL_PATHS[@]} == 0 )); then
|
||||||
log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup"
|
log_error "No models matched filters. Adjust --max-size or --category"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_info "Found ${#MODEL_PATHS[@]} model(s):"
|
log_info "Benchmarking ${#MODEL_PATHS[@]} model(s):"
|
||||||
for p in "${MODEL_PATHS[@]}"; do
|
for p in "${MODEL_PATHS[@]}"; do
|
||||||
printf " %s (%s)\n" "$(basename "$p")" "$(du -h "$p" | cut -f1)"
|
file_size_bytes=$(stat -Lc%s "$p" 2>/dev/null || echo 0)
|
||||||
|
printf " %s (%.1f GB)\n" "$(basename "$p")" "$(echo "scale=1; $file_size_bytes / 1073741824" | bc)"
|
||||||
done
|
done
|
||||||
|
|
||||||
# ── 3. Start metric logging ─────────────────────────────
|
# ── 4. Start metric logging ─────────────────────────────
|
||||||
METRICS_FILE="$RESULT_DIR/metrics.csv"
|
METRICS_FILE="$RESULT_DIR/metrics.csv"
|
||||||
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
|
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
|
||||||
METRICS_PID=$!
|
METRICS_PID=$!
|
||||||
@@ -76,7 +151,7 @@ cleanup() {
|
|||||||
}
|
}
|
||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
||||||
# ── 4. Run benchmarks ───────────────────────────────────
|
# ── 5. Run benchmarks ───────────────────────────────────
|
||||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||||
MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
|
MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
|
||||||
|
|
||||||
@@ -110,6 +185,10 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Long-context test (pp2048, tg32, ctx 32768)
|
# Long-context test (pp2048, tg32, ctx 32768)
|
||||||
|
if $SKIP_LONGCTX; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
|
printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
|
||||||
@@ -136,7 +215,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
# ── 5. Parse results into summary JSON ──────────────────
|
# ── 6. Parse results into summary JSON ──────────────────
|
||||||
log_info "Parsing results..."
|
log_info "Parsing results..."
|
||||||
SUMMARY="$RESULT_DIR/summary.json"
|
SUMMARY="$RESULT_DIR/summary.json"
|
||||||
|
|
||||||
@@ -152,7 +231,6 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
if "FAILED" in content:
|
if "FAILED" in content:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Parse the pipe-delimited llama-bench table
|
|
||||||
for line in content.splitlines():
|
for line in content.splitlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
||||||
@@ -164,14 +242,12 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
if len(parts) < 10:
|
if len(parts) < 10:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Columns: | model | size | params | backend | ngl | fa | mmap | test | t/s |
|
|
||||||
try:
|
try:
|
||||||
test_type = parts[8].strip() if len(parts) > 8 else ""
|
test_type = parts[8].strip() if len(parts) > 8 else ""
|
||||||
ts_raw = parts[9].strip() if len(parts) > 9 else ""
|
ts_raw = parts[9].strip() if len(parts) > 9 else ""
|
||||||
if not test_type or not ts_raw:
|
if not test_type or not ts_raw:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Parse "548.18 +/- 1.59" or just "548.18"
|
|
||||||
ts_match = re.match(r'([\d.]+)', ts_raw)
|
ts_match = re.match(r'([\d.]+)', ts_raw)
|
||||||
if not ts_match:
|
if not ts_match:
|
||||||
continue
|
continue
|
||||||
@@ -191,7 +267,7 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
print(json.dumps({"results": results}, indent=2))
|
print(json.dumps({"results": results}, indent=2))
|
||||||
PYEOF
|
PYEOF
|
||||||
|
|
||||||
# ── 6. Display summary ──────────────────────────────────
|
# ── 7. Display summary ──────────────────────────────────
|
||||||
log_header "Baseline Results"
|
log_header "Baseline Results"
|
||||||
|
|
||||||
python3 - "$SUMMARY" << 'PYEOF'
|
python3 - "$SUMMARY" << 'PYEOF'
|
||||||
@@ -204,7 +280,6 @@ if not data["results"]:
|
|||||||
print(" No results parsed. Check log files for errors.")
|
print(" No results parsed. Check log files for errors.")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# Print table
|
|
||||||
fmt = " {:<20} {:<16} {:<8} {:>10}"
|
fmt = " {:<20} {:<16} {:<8} {:>10}"
|
||||||
print(fmt.format("Model", "Backend", "Test", "t/s"))
|
print(fmt.format("Model", "Backend", "Test", "t/s"))
|
||||||
print(" " + "-" * 58)
|
print(" " + "-" * 58)
|
||||||
|
|||||||
@@ -11,12 +11,33 @@ MODEL_DIR="$(data_dir models)"
|
|||||||
TAG="run"
|
TAG="run"
|
||||||
BACKENDS_FILTER=""
|
BACKENDS_FILTER=""
|
||||||
MODELS_FILTER=""
|
MODELS_FILTER=""
|
||||||
|
SKIP_LONGCTX=false
|
||||||
|
MAX_SIZE_GB=0
|
||||||
|
CATEGORY_FILTER=""
|
||||||
|
REPS_STANDARD=5
|
||||||
|
REPS_LONGCTX=3
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--tag|-t) TAG="$2"; shift 2 ;;
|
--tag|-t) TAG="$2"; shift 2 ;;
|
||||||
--backends|-b) BACKENDS_FILTER="$2"; shift 2 ;;
|
--backends|-b) BACKENDS_FILTER="$2"; shift 2 ;;
|
||||||
--models|-m) MODELS_FILTER="$2"; shift 2 ;;
|
--models|-m) MODELS_FILTER="$2"; shift 2 ;;
|
||||||
|
--skip-longctx) SKIP_LONGCTX=true; shift ;;
|
||||||
|
--max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
|
||||||
|
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
|
||||||
|
--reps|-r) REPS_STANDARD="$2"; shift 2 ;;
|
||||||
|
--help|-h)
|
||||||
|
echo "Usage: run-suite.sh [OPTIONS]"
|
||||||
|
echo ""
|
||||||
|
echo "Options:"
|
||||||
|
echo " --tag NAME Tag this run (default: run)"
|
||||||
|
echo " --backends LIST Comma-separated backend filter"
|
||||||
|
echo " --models LIST Comma-separated model filename filter"
|
||||||
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
|
echo " --max-size GB Only bench models up to this file size in GB"
|
||||||
|
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
|
||||||
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
exit 0 ;;
|
||||||
*) log_warn "Unknown argument: $1"; shift ;;
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
@@ -25,9 +46,6 @@ TS="$(timestamp)"
|
|||||||
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
|
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
|
||||||
mkdir -p "$RESULT_DIR"
|
mkdir -p "$RESULT_DIR"
|
||||||
|
|
||||||
REPS_STANDARD=5
|
|
||||||
REPS_LONGCTX=3
|
|
||||||
|
|
||||||
log_header "Benchmark Suite: $TAG"
|
log_header "Benchmark Suite: $TAG"
|
||||||
log_info "Results: $RESULT_DIR"
|
log_info "Results: $RESULT_DIR"
|
||||||
|
|
||||||
@@ -60,26 +78,56 @@ if (( ${#available_backends[@]} == 0 )); then
|
|||||||
fi
|
fi
|
||||||
log_info "Backends: ${available_backends[*]}"
|
log_info "Backends: ${available_backends[*]}"
|
||||||
|
|
||||||
# Find models
|
# Find and filter models
|
||||||
mapfile -t MODEL_PATHS < <(
|
mapfile -t ALL_MODEL_PATHS < <(
|
||||||
find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
|
find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
|
||||||
\( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
|
\( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
|
||||||
| sort
|
| sort
|
||||||
)
|
)
|
||||||
|
|
||||||
if [[ -n "$MODELS_FILTER" ]]; then
|
MODEL_PATHS=()
|
||||||
filtered=()
|
for p in "${ALL_MODEL_PATHS[@]}"; do
|
||||||
for p in "${MODEL_PATHS[@]}"; do
|
local_name="$(basename "$p")"
|
||||||
name="$(basename "$p")"
|
|
||||||
if echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$name"; then
|
# Name filter
|
||||||
filtered+=("$p")
|
if [[ -n "$MODELS_FILTER" ]]; then
|
||||||
|
if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$local_name"; then
|
||||||
|
continue
|
||||||
fi
|
fi
|
||||||
done
|
fi
|
||||||
MODEL_PATHS=("${filtered[@]}")
|
|
||||||
fi
|
# Size filter
|
||||||
|
if (( MAX_SIZE_GB > 0 )); then
|
||||||
|
file_size_gb=$(( $(stat -Lc%s "$p" 2>/dev/null || echo 0) / 1073741824 ))
|
||||||
|
if (( file_size_gb >= MAX_SIZE_GB )); then
|
||||||
|
log_info "Skipping $local_name (${file_size_gb} GB > ${MAX_SIZE_GB} GB limit)"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Category filter
|
||||||
|
if [[ -n "$CATEGORY_FILTER" ]] && [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
|
||||||
|
matched=false
|
||||||
|
found_in_catalog=false
|
||||||
|
while IFS='|' read -r name repo file size_gb category desc; do
|
||||||
|
[[ "$name" =~ ^#.*$ || -z "$name" ]] && continue
|
||||||
|
if [[ "$local_name" == "$file" ]]; then
|
||||||
|
found_in_catalog=true
|
||||||
|
echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category" && matched=true
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done < "$PROJECT_ROOT/configs/models.conf"
|
||||||
|
if $found_in_catalog && ! $matched; then
|
||||||
|
log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
MODEL_PATHS+=("$p")
|
||||||
|
done
|
||||||
|
|
||||||
if (( ${#MODEL_PATHS[@]} == 0 )); then
|
if (( ${#MODEL_PATHS[@]} == 0 )); then
|
||||||
log_error "No models found. Run: make benchmark-setup"
|
log_error "No models matched filters. Run: make benchmark-setup"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
log_info "Models: ${#MODEL_PATHS[@]}"
|
log_info "Models: ${#MODEL_PATHS[@]}"
|
||||||
@@ -115,6 +163,9 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Long-context test
|
# Long-context test
|
||||||
|
if $SKIP_LONGCTX; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME"
|
printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME"
|
||||||
|
|||||||
37
tests/benchmark_flags.bats
Normal file
37
tests/benchmark_flags.bats
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
# Tests for benchmark script filtering flags
|
||||||
|
|
||||||
|
load test_helper.sh
|
||||||
|
|
||||||
|
@test "baseline --help shows usage and exits 0" {
|
||||||
|
run bash "$PROJECT_ROOT/scripts/benchmark/run-baseline.sh" --help
|
||||||
|
assert_success
|
||||||
|
assert_output --partial "Usage"
|
||||||
|
assert_output --partial "--max-size"
|
||||||
|
assert_output --partial "--category"
|
||||||
|
assert_output --partial "--skip-longctx"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "run-suite --help shows usage and exits 0" {
|
||||||
|
run bash "$PROJECT_ROOT/scripts/benchmark/run-suite.sh" --help
|
||||||
|
assert_success
|
||||||
|
assert_output --partial "Usage"
|
||||||
|
assert_output --partial "--max-size"
|
||||||
|
assert_output --partial "--category"
|
||||||
|
assert_output --partial "--skip-longctx"
|
||||||
|
assert_output --partial "--tag"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "benchmark dispatcher shows help with no args" {
|
||||||
|
run bash "$PROJECT_ROOT/bin/benchmark"
|
||||||
|
assert_failure
|
||||||
|
assert_output --partial "Commands"
|
||||||
|
assert_output --partial "--max-size"
|
||||||
|
assert_output --partial "--skip-longctx"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "benchmark dispatcher passes --help through to baseline" {
|
||||||
|
run bash "$PROJECT_ROOT/bin/benchmark" baseline --help
|
||||||
|
assert_success
|
||||||
|
assert_output --partial "Usage"
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user