feat(benchmark): add -b/--batch flag, test MoE batch size impact
Add batch size override to benchmark scripts. Testing -b 256 vs default 2048 on Vulkan RADV shows no meaningful difference for MoE pp2048 (843 vs 826 t/s, within noise). Community-reported +70% improvement does not reproduce on this backend.
This commit is contained in:
@@ -104,11 +104,14 @@ Living document tracking what was applied, tested, and the actual results. Each
|
|||||||
|
|
||||||
### 3.2 MoE Batch Size `-b 256`
|
### 3.2 MoE Batch Size `-b 256`
|
||||||
|
|
||||||
- **Date**: PENDING
|
- **Date**: 2026-03-30
|
||||||
- **Change**: Add `-b 256` to MoE benchmark runs
|
- **Change**: `-b 256` vs default (2048)
|
||||||
- **Expected**: Up to +70% pp improvement for MoE models (community benchmarks)
|
- **Benchmark**: `data/benchmarks/batch-default-*` vs `data/benchmarks/batch-256-*`
|
||||||
- **Benchmark**: Not yet run
|
- **Result** (Vulkan RADV, Qwen3.5-35B-A3B UD-Q4_K_XL, q4_0 KV):
|
||||||
- **Verdict**: PENDING
|
- Default (b=2048): 826 t/s pp, 55.9 t/s tg
|
||||||
|
- b=256: 843 t/s pp, 55.5 t/s tg (within noise)
|
||||||
|
- **Notes**: Community-reported +70% improvement does not reproduce on Vulkan RADV. May only apply to ROCm or CPU backends, or to longer prompts (pp8192+).
|
||||||
|
- **Verdict**: NO IMPACT on Vulkan — not recommended
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ CTX_DEPTH=32768
|
|||||||
CTX_PROMPT=2048
|
CTX_PROMPT=2048
|
||||||
PP_TOKENS=512
|
PP_TOKENS=512
|
||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
|
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
||||||
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
@@ -32,6 +33,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
--tg) TG_TOKENS="$2"; shift 2 ;;
|
--tg) TG_TOKENS="$2"; shift 2 ;;
|
||||||
|
-b|--batch) BATCH_SIZE="$2"; shift 2 ;;
|
||||||
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: run-baseline.sh [OPTIONS]"
|
echo "Usage: run-baseline.sh [OPTIONS]"
|
||||||
@@ -44,6 +46,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
echo " --tg N Token generation count (default: 128)"
|
echo " --tg N Token generation count (default: 128)"
|
||||||
|
echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)"
|
||||||
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
||||||
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
||||||
echo " Types: f16, q8_0, q4_0, q4_1"
|
echo " Types: f16, q8_0, q4_0, q4_1"
|
||||||
@@ -220,16 +223,24 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Build batch size args
|
||||||
|
BATCH_ARGS=()
|
||||||
|
BATCH_SUFFIX=""
|
||||||
|
if [[ -n "$BATCH_SIZE" ]]; then
|
||||||
|
BATCH_ARGS+=(-b "$BATCH_SIZE")
|
||||||
|
BATCH_SUFFIX="__b${BATCH_SIZE}"
|
||||||
|
fi
|
||||||
|
|
||||||
# Standard test
|
# Standard test
|
||||||
local_suffix="fa1"
|
local_suffix="fa1"
|
||||||
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
||||||
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
|
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT" ]]; then
|
if [[ ! -s "$OUT" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
||||||
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
||||||
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
|
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD[*]}"
|
printf " cmd: %s\n" "${CMD[*]}"
|
||||||
if "${CMD[@]}" > "$OUT" 2>&1; then
|
if "${CMD[@]}" > "$OUT" 2>&1; then
|
||||||
@@ -248,7 +259,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \
|
printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \
|
||||||
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ CTX_DEPTH=32768
|
|||||||
CTX_PROMPT=2048
|
CTX_PROMPT=2048
|
||||||
PP_TOKENS=512
|
PP_TOKENS=512
|
||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
|
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
|
||||||
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
@@ -34,6 +35,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
--tg) TG_TOKENS="$2"; shift 2 ;;
|
--tg) TG_TOKENS="$2"; shift 2 ;;
|
||||||
|
-b|--batch) BATCH_SIZE="$2"; shift 2 ;;
|
||||||
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: run-suite.sh [OPTIONS]"
|
echo "Usage: run-suite.sh [OPTIONS]"
|
||||||
@@ -49,6 +51,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
echo " --tg N Token generation count (default: 128)"
|
echo " --tg N Token generation count (default: 128)"
|
||||||
|
echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)"
|
||||||
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
||||||
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
||||||
echo " Types: f16, q8_0, q4_0, q4_1"
|
echo " Types: f16, q8_0, q4_0, q4_1"
|
||||||
@@ -213,16 +216,24 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Build batch size args
|
||||||
|
BATCH_ARGS=()
|
||||||
|
BATCH_SUFFIX=""
|
||||||
|
if [[ -n "$BATCH_SIZE" ]]; then
|
||||||
|
BATCH_ARGS+=(-b "$BATCH_SIZE")
|
||||||
|
BATCH_SUFFIX="__b${BATCH_SIZE}"
|
||||||
|
fi
|
||||||
|
|
||||||
# Standard test
|
# Standard test
|
||||||
local_suffix="fa1"
|
local_suffix="fa1"
|
||||||
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
||||||
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
|
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT" ]]; then
|
if [[ ! -s "$OUT" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
||||||
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
||||||
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
|
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
if "${CMD[@]}" > "$OUT" 2>&1; then
|
if "${CMD[@]}" > "$OUT" 2>&1; then
|
||||||
log_success "Done"; tail -3 "$OUT"
|
log_success "Done"; tail -3 "$OUT"
|
||||||
else
|
else
|
||||||
@@ -234,7 +245,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
if $SKIP_LONGCTX; then
|
if $SKIP_LONGCTX; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
|
printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
|
||||||
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
||||||
|
|||||||
Reference in New Issue
Block a user