feat: add --pp and --tg flags for realistic benchmark workloads

Standard benchmarks use pp512/tg128, which underestimates real-world
agentic coding workloads, where responses are 500-2000 tokens. Both
values are now configurable:

  --pp N    Prompt processing tokens (default: 512)
  --tg N    Token generation count (default: 128)

Examples:
  benchmark run --tag realistic --tg 1024 --pp 2048 --category moe
  benchmark run --tag full-response --tg 2048 --category moe --reps 3

Log filenames include a pp/tg suffix when either value is non-default (e.g., model__backend__fa1__pp2048_tg1024.log)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Felipe Cardoso
2026-03-26 22:48:32 +01:00
parent 3686783f4d
commit 38daf953bf
2 changed files with 27 additions and 7 deletions

View File

@@ -19,6 +19,8 @@ MAX_SIZE_GB=0 # 0 = no limit
CATEGORY_FILTER="" CATEGORY_FILTER=""
CTX_DEPTH=32768 CTX_DEPTH=32768
CTX_PROMPT=2048 CTX_PROMPT=2048
PP_TOKENS=512
TG_TOKENS=128
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case "$1" in case "$1" in
@@ -27,6 +29,8 @@ while [[ $# -gt 0 ]]; do
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;; --category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
--reps|-r) REPS_STANDARD="$2"; shift 2 ;; --reps|-r) REPS_STANDARD="$2"; shift 2 ;;
--context|-d) CTX_DEPTH="$2"; shift 2 ;; --context|-d) CTX_DEPTH="$2"; shift 2 ;;
--pp) PP_TOKENS="$2"; shift 2 ;;
--tg) TG_TOKENS="$2"; shift 2 ;;
--help|-h) --help|-h)
echo "Usage: run-baseline.sh [OPTIONS]" echo "Usage: run-baseline.sh [OPTIONS]"
echo "" echo ""
@@ -36,10 +40,13 @@ while [[ $# -gt 0 ]]; do
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)" echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
echo " --reps N Standard test repetitions (default: 5)" echo " --reps N Standard test repetitions (default: 5)"
echo " --context N Long-context depth in tokens (default: 32768)" echo " --context N Long-context depth in tokens (default: 32768)"
echo " --pp N Prompt processing tokens (default: 512)"
echo " --tg N Token generation count (default: 128)"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " run-baseline.sh --max-size 20 # Only models ≤20 GB" echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE" echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE"
echo " run-baseline.sh --tg 1024 --pp 2048 --category moe # Realistic agentic"
echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run" echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
exit 0 ;; exit 0 ;;
*) log_warn "Unknown argument: $1"; shift ;; *) log_warn "Unknown argument: $1"; shift ;;
@@ -182,12 +189,15 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}" TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
fi fi
# Standard test (pp512 + tg128, default context) # Standard test
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log" local_suffix="fa1"
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
if [[ ! -s "$OUT" ]]; then if [[ ! -s "$OUT" ]]; then
printf "\n${BOLD}>> [%s] %s — standard test${RESET}\n" "$BACKEND" "$MODEL_NAME" printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 -r "$REPS_STANDARD") -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
printf " cmd: %s\n" "${CMD[*]}" printf " cmd: %s\n" "${CMD[*]}"
if "${CMD[@]}" > "$OUT" 2>&1; then if "${CMD[@]}" > "$OUT" 2>&1; then

View File

@@ -18,6 +18,8 @@ REPS_STANDARD=5
REPS_LONGCTX=3 REPS_LONGCTX=3
CTX_DEPTH=32768 CTX_DEPTH=32768
CTX_PROMPT=2048 CTX_PROMPT=2048
PP_TOKENS=512
TG_TOKENS=128
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case "$1" in case "$1" in
@@ -29,6 +31,8 @@ while [[ $# -gt 0 ]]; do
--category|-c) CATEGORY_FILTER="$2"; shift 2 ;; --category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
--reps|-r) REPS_STANDARD="$2"; shift 2 ;; --reps|-r) REPS_STANDARD="$2"; shift 2 ;;
--context|-d) CTX_DEPTH="$2"; shift 2 ;; --context|-d) CTX_DEPTH="$2"; shift 2 ;;
--pp) PP_TOKENS="$2"; shift 2 ;;
--tg) TG_TOKENS="$2"; shift 2 ;;
--help|-h) --help|-h)
echo "Usage: run-suite.sh [OPTIONS]" echo "Usage: run-suite.sh [OPTIONS]"
echo "" echo ""
@@ -41,9 +45,12 @@ while [[ $# -gt 0 ]]; do
echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)" echo " --category LIST Comma-separated: smoke,dense,moe (from models.conf)"
echo " --reps N Standard test repetitions (default: 5)" echo " --reps N Standard test repetitions (default: 5)"
echo " --context N Long-context depth in tokens (default: 32768)" echo " --context N Long-context depth in tokens (default: 32768)"
echo " --pp N Prompt processing tokens (default: 512)"
echo " --tg N Token generation count (default: 128)"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " run-suite.sh --tag ctx128k --context 131072 --category moe" echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx" echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
exit 0 ;; exit 0 ;;
*) log_warn "Unknown argument: $1"; shift ;; *) log_warn "Unknown argument: $1"; shift ;;
@@ -170,11 +177,14 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
fi fi
# Standard test # Standard test
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log" local_suffix="fa1"
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
if [[ ! -s "$OUT" ]]; then if [[ ! -s "$OUT" ]]; then
printf "\n${BOLD}>> [%s] %s — standard${RESET}\n" "$BACKEND" "$MODEL_NAME" printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 -r "$REPS_STANDARD") -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
if "${CMD[@]}" > "$OUT" 2>&1; then if "${CMD[@]}" > "$OUT" 2>&1; then
log_success "Done"; tail -3 "$OUT" log_success "Done"; tail -3 "$OUT"
else else