feat: add --pp and --tg flags for realistic benchmark workloads

Standard benchmarks use pp512/tg128, which underestimates real-world agentic coding, where responses are 500-2000 tokens.

Now configurable:
  --pp N    Prompt processing tokens (default: 512)
  --tg N    Token generation count (default: 128)

Examples:
  benchmark run --tag realistic --tg 1024 --pp 2048 --category moe
  benchmark run --tag full-response --tg 2048 --category moe --reps 3

Log filenames include pp/tg when non-default (e.g., model__backend__fa1__pp2048_tg1024.log).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 22:48:32 +01:00
parent 3686783f4d
commit 38daf953bf
2 changed files with 27 additions and 7 deletions
--- a/scripts/benchmark/run-baseline.sh
+++ b/scripts/benchmark/run-baseline.sh
@@ -19,6 +19,8 @@ MAX_SIZE_GB=0  # 0 = no limit
 CATEGORY_FILTER=""
 CTX_DEPTH=32768
 CTX_PROMPT=2048
+PP_TOKENS=512
+TG_TOKENS=128

 while [[ $# -gt 0 ]]; do
    case "$1" in
@@ -27,6 +29,8 @@ while [[ $# -gt 0 ]]; do
        --category|-c)   CATEGORY_FILTER="$2"; shift 2 ;;
        --reps|-r)       REPS_STANDARD="$2"; shift 2 ;;
        --context|-d)    CTX_DEPTH="$2"; shift 2 ;;
+        --pp)            PP_TOKENS="$2"; shift 2 ;;
+        --tg)            TG_TOKENS="$2"; shift 2 ;;
        --help|-h)
            echo "Usage: run-baseline.sh [OPTIONS]"
            echo ""
@@ -36,10 +40,13 @@ while [[ $# -gt 0 ]]; do
            echo "  --category LIST      Comma-separated: smoke,dense,moe (from models.conf)"
            echo "  --reps N             Standard test repetitions (default: 5)"
            echo "  --context N          Long-context depth in tokens (default: 32768)"
+            echo "  --pp N               Prompt processing tokens (default: 512)"
+            echo "  --tg N               Token generation count (default: 128)"
            echo ""
            echo "Examples:"
            echo "  run-baseline.sh --max-size 20               # Only models ≤20 GB"
            echo "  run-baseline.sh --context 131072 --category moe  # 128K context on MoE"
+            echo "  run-baseline.sh --tg 1024 --pp 2048 --category moe  # Realistic agentic"
            echo "  run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
@@ -182,12 +189,15 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
            TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
        fi

-        # Standard test (pp512 + tg128, default context)
-        OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log"
+        # Standard test
+        local_suffix="fa1"
+        [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
+        OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
        if [[ ! -s "$OUT" ]]; then
-            printf "\n${BOLD}>> [%s] %s — standard test${RESET}\n" "$BACKEND" "$MODEL_NAME"
+            printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
            CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 -r "$REPS_STANDARD")
+                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
+                -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")

            printf "  cmd: %s\n" "${CMD[*]}"
            if "${CMD[@]}" > "$OUT" 2>&1; then