feat: add --context flag for configurable long-context benchmarks

Both run-baseline.sh and run-suite.sh now accept --context N to set the
long-context depth (default: 32768). When the depth exceeds 32768, prompt
tokens auto-scale to ~1/16 of the context depth; otherwise they stay at 2048.

Examples:
  benchmark run --tag ctx64k --context 65536 --category moe
  benchmark run --tag ctx128k --context 131072 --category moe --reps 3

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 22:46:16 +01:00
parent 1b5b193e81
commit 3686783f4d
2 changed files with 33 additions and 9 deletions
--- a/scripts/benchmark/run-baseline.sh
+++ b/scripts/benchmark/run-baseline.sh
@@ -17,6 +17,8 @@ REPS_LONGCTX=3
 SKIP_LONGCTX=false
 MAX_SIZE_GB=0  # 0 = no limit
 CATEGORY_FILTER=""
 CTX_DEPTH=32768
 CTX_PROMPT=2048
 while [[ $# -gt 0 ]]; do
    case "$1" in
@@ -24,24 +26,32 @@ while [[ $# -gt 0 ]]; do
        --max-size|-s)   MAX_SIZE_GB="$2"; shift 2 ;;
        --category|-c)   CATEGORY_FILTER="$2"; shift 2 ;;
        --reps|-r)       REPS_STANDARD="$2"; shift 2 ;;
        --context|-d)    CTX_DEPTH="$2"; shift 2 ;;
        --help|-h)
            echo "Usage: run-baseline.sh [OPTIONS]"
            echo ""
            echo "Options:"
-            echo "  --skip-longctx       Skip long-context (32K) tests"
+            echo "  --skip-longctx       Skip long-context tests"
            echo "  --max-size GB        Only bench models up to this file size in GB"
            echo "  --category LIST      Comma-separated: smoke,dense,moe (from models.conf)"
            echo "  --reps N             Standard test repetitions (default: 5)"
            echo "  --context N          Long-context depth in tokens (default: 32768)"
            echo ""
            echo "Examples:"
            echo "  run-baseline.sh --max-size 20               # Only models ≤20 GB"
-            echo "  run-baseline.sh --category smoke,moe         # Only smoke + MoE models"
+            echo "  run-baseline.sh --context 131072 --category moe  # 128K context on MoE"
            echo "  run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
    esac
 done
 # Scale prompt tokens for large contexts: ~1/16 of depth, min 512
 if (( CTX_DEPTH > 32768 )); then
    CTX_PROMPT=$(( CTX_DEPTH / 16 ))
    (( CTX_PROMPT < 512 )) && CTX_PROMPT=512
 fi
 log_header "Baseline Benchmark Capture"
 log_info "Results will be saved to: $RESULT_DIR"
 $SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
@@ -196,16 +206,16 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
            continue
        fi
-        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
+        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
        if [[ ! -s "$OUT_LC" ]]; then
-            printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
+            printf "\n${BOLD}>> [%s] %s — long-context %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
            UB_SIZE=2048
            [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
            CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-                -p 2048 -n 32 -d 32768 -ub "$UB_SIZE"
+                -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
                -r "$REPS_LONGCTX")
            printf "  cmd: %s\n" "${CMD_LC[*]}"
--- a/scripts/benchmark/run-suite.sh
+++ b/scripts/benchmark/run-suite.sh
@@ -16,6 +16,8 @@ MAX_SIZE_GB=0
 CATEGORY_FILTER=""
 REPS_STANDARD=5
 REPS_LONGCTX=3
 CTX_DEPTH=32768
 CTX_PROMPT=2048
 while [[ $# -gt 0 ]]; do
    case "$1" in
@@ -26,6 +28,7 @@ while [[ $# -gt 0 ]]; do
        --max-size|-s)   MAX_SIZE_GB="$2"; shift 2 ;;
        --category|-c)   CATEGORY_FILTER="$2"; shift 2 ;;
        --reps|-r)       REPS_STANDARD="$2"; shift 2 ;;
        --context|-d)    CTX_DEPTH="$2"; shift 2 ;;
        --help|-h)
            echo "Usage: run-suite.sh [OPTIONS]"
            echo ""
@@ -33,15 +36,26 @@ while [[ $# -gt 0 ]]; do
            echo "  --tag NAME           Tag this run (default: run)"
            echo "  --backends LIST      Comma-separated backend filter"
            echo "  --models LIST        Comma-separated model filename filter"
-            echo "  --skip-longctx       Skip long-context (32K) tests"
+            echo "  --skip-longctx       Skip long-context tests"
            echo "  --max-size GB        Only bench models up to this file size in GB"
            echo "  --category LIST      Comma-separated: smoke,dense,moe (from models.conf)"
            echo "  --reps N             Standard test repetitions (default: 5)"
            echo "  --context N          Long-context depth in tokens (default: 32768)"
            echo ""
            echo "Examples:"
            echo "  run-suite.sh --tag ctx128k --context 131072 --category moe"
            echo "  run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
    esac
 done
 # Scale prompt tokens for large contexts (depth > 32768): use ~1/16 of context depth, min 512
 if (( CTX_DEPTH > 32768 )); then
    CTX_PROMPT=$(( CTX_DEPTH / 16 ))
    (( CTX_PROMPT < 512 )) && CTX_PROMPT=512
 fi
 TS="$(timestamp)"
 RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
 mkdir -p "$RESULT_DIR"
@@ -172,13 +186,13 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
        if $SKIP_LONGCTX; then
            continue
        fi
-        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
+        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
        if [[ ! -s "$OUT_LC" ]]; then
-            printf "\n${BOLD}>> [%s] %s — longctx${RESET}\n" "$BACKEND" "$MODEL_NAME"
+            printf "\n${BOLD}>> [%s] %s — longctx %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
            UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
            CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-                -p 2048 -n 32 -d 32768 -ub "$UB_SIZE" -r "$REPS_LONGCTX")
+                -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX")
            if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
                log_success "Done"; tail -3 "$OUT_LC"
            else