diff --git a/docs/optimization-log.md b/docs/optimization-log.md index 83c28f0..1dfca28 100644 --- a/docs/optimization-log.md +++ b/docs/optimization-log.md @@ -104,11 +104,14 @@ Living document tracking what was applied, tested, and the actual results. Each ### 3.2 MoE Batch Size `-b 256` -- **Date**: PENDING -- **Change**: Add `-b 256` to MoE benchmark runs -- **Expected**: Up to +70% pp improvement for MoE models (community benchmarks) -- **Benchmark**: Not yet run -- **Verdict**: PENDING +- **Date**: 2026-03-30 +- **Change**: `-b 256` vs default (2048) +- **Benchmark**: `data/benchmarks/batch-default-*` vs `data/benchmarks/batch-256-*` +- **Result** (Vulkan RADV, Qwen3.5-35B-A3B UD-Q4_K_XL, q4_0 KV): + - Default: 826 pp, 55.9 tg + - b=256: 843 pp, 55.5 tg (within noise) +- **Notes**: Community-reported +70% improvement does not reproduce on Vulkan RADV. May only apply to ROCm or CPU backends, or to longer prompts (pp8192+). +- **Verdict**: NO IMPACT on Vulkan — not recommended --- diff --git a/scripts/benchmark/run-baseline.sh b/scripts/benchmark/run-baseline.sh index fe5b8cc..99567d0 100644 --- a/scripts/benchmark/run-baseline.sh +++ b/scripts/benchmark/run-baseline.sh @@ -21,6 +21,7 @@ CTX_DEPTH=32768 CTX_PROMPT=2048 PP_TOKENS=512 TG_TOKENS=128 +BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048) KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0) while [[ $# -gt 0 ]]; do @@ -32,6 +33,7 @@ while [[ $# -gt 0 ]]; do --context|-d) CTX_DEPTH="$2"; shift 2 ;; --pp) PP_TOKENS="$2"; shift 2 ;; --tg) TG_TOKENS="$2"; shift 2 ;; + -b|--batch) BATCH_SIZE="$2"; shift 2 ;; --kv-types) KV_TYPES_RAW="$2"; shift 2 ;; --help|-h) echo "Usage: run-baseline.sh [OPTIONS]" @@ -44,6 +46,7 @@ while [[ $# -gt 0 ]]; do echo " --context N Long-context depth in tokens (default: 32768)" echo " --pp N Prompt processing tokens (default: 512)" echo " --tg N Token generation count (default: 128)" + echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)" echo " --kv-types LIST KV cache sweep: comma-separated types to test" echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE" echo " Types: f16, q8_0, q4_0, q4_1" @@ -220,16 +223,24 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do KV_SUFFIX="__kv_${KV_K}-${KV_V}" fi + # Build batch size args + BATCH_ARGS=() + BATCH_SUFFIX="" + if [[ -n "$BATCH_SIZE" ]]; then + BATCH_ARGS+=(-b "$BATCH_SIZE") + BATCH_SUFFIX="__b${BATCH_SIZE}" + fi + # Standard test local_suffix="fa1" [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}" - OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log" + OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log" if [[ ! -s "$OUT" ]]; then printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \ "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}" CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 - -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}") + -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}") printf " cmd: %s\n" "${CMD[*]}" if "${CMD[@]}" > "$OUT" 2>&1; then @@ -248,7 +259,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do continue fi - OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log" + OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log" if [[ ! -s "$OUT_LC" ]]; then printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \ "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}" diff --git a/scripts/benchmark/run-suite.sh b/scripts/benchmark/run-suite.sh index a1677ea..bd3f4f0 100644 --- a/scripts/benchmark/run-suite.sh +++ b/scripts/benchmark/run-suite.sh @@ -20,6 +20,7 @@ CTX_DEPTH=32768 CTX_PROMPT=2048 PP_TOKENS=512 TG_TOKENS=128 +BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048) KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0) while [[ $# -gt 0 ]]; do @@ -34,6 +35,7 @@ while [[ $# -gt 0 ]]; do --context|-d) CTX_DEPTH="$2"; shift 2 ;; --pp) PP_TOKENS="$2"; shift 2 ;; --tg) TG_TOKENS="$2"; shift 2 ;; + -b|--batch) BATCH_SIZE="$2"; shift 2 ;; --kv-types) KV_TYPES_RAW="$2"; shift 2 ;; --help|-h) echo "Usage: run-suite.sh [OPTIONS]" @@ -49,6 +51,7 @@ while [[ $# -gt 0 ]]; do echo " --context N Long-context depth in tokens (default: 32768)" echo " --pp N Prompt processing tokens (default: 512)" echo " --tg N Token generation count (default: 128)" + echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)" echo " --kv-types LIST KV cache sweep: comma-separated types to test" echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE" echo " Types: f16, q8_0, q4_0, q4_1" @@ -213,16 +216,24 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do KV_SUFFIX="__kv_${KV_K}-${KV_V}" fi + # Build batch size args + BATCH_ARGS=() + BATCH_SUFFIX="" + if [[ -n "$BATCH_SIZE" ]]; then + BATCH_ARGS+=(-b "$BATCH_SIZE") + BATCH_SUFFIX="__b${BATCH_SIZE}" + fi + # Standard test local_suffix="fa1" [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}" - OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log" + OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log" if [[ ! -s "$OUT" ]]; then printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \ "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}" CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 - -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}") + -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}") if "${CMD[@]}" > "$OUT" 2>&1; then log_success "Done"; tail -3 "$OUT" else @@ -234,7 +245,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do if $SKIP_LONGCTX; then continue fi - OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log" + OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log" if [[ ! -s "$OUT_LC" ]]; then printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \ "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"