feat(benchmark): add --kv-types flag for KV cache quantization sweep
This commit is contained in:
@@ -23,10 +23,12 @@ case "${1:-help}" in
|
|||||||
echo " --category LIST Comma-separated: smoke,dense,moe"
|
echo " --category LIST Comma-separated: smoke,dense,moe"
|
||||||
echo " --skip-longctx Skip long-context (32K) tests"
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
echo " --reps N Standard test repetitions (default: 5)"
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " benchmark baseline --max-size 20 --skip-longctx"
|
echo " benchmark baseline --max-size 20 --skip-longctx"
|
||||||
echo " benchmark run --tag post-opt --category moe"
|
echo " benchmark run --tag post-opt --category moe"
|
||||||
|
echo " benchmark run --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ CTX_DEPTH=32768
|
|||||||
CTX_PROMPT=2048
|
CTX_PROMPT=2048
|
||||||
PP_TOKENS=512
|
PP_TOKENS=512
|
||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -31,6 +32,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
--tg) TG_TOKENS="$2"; shift 2 ;;
|
--tg) TG_TOKENS="$2"; shift 2 ;;
|
||||||
|
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: run-baseline.sh [OPTIONS]"
|
echo "Usage: run-baseline.sh [OPTIONS]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -42,11 +44,15 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
echo " --tg N Token generation count (default: 128)"
|
echo " --tg N Token generation count (default: 128)"
|
||||||
|
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
||||||
|
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
||||||
|
echo " Types: f16, q8_0, q4_0, q4_1"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
|
echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
|
||||||
echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE"
|
echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE"
|
||||||
echo " run-baseline.sh --tg 1024 --pp 2048 --category moe # Realistic agentic"
|
echo " run-baseline.sh --tg 1024 --pp 2048 --category moe # Realistic agentic"
|
||||||
|
echo " run-baseline.sh --kv-types f16,q8_0,q4_0 --context 131072 # KV sweep"
|
||||||
echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
|
echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
|
||||||
exit 0 ;;
|
exit 0 ;;
|
||||||
*) log_warn "Unknown argument: $1"; shift ;;
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
@@ -59,11 +65,19 @@ if (( CTX_DEPTH > 32768 )); then
|
|||||||
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Parse KV cache types for sweep
|
||||||
|
if [[ -n "$KV_TYPES_RAW" ]]; then
|
||||||
|
IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
|
||||||
|
else
|
||||||
|
KV_TYPES=("f16")
|
||||||
|
fi
|
||||||
|
|
||||||
log_header "Baseline Benchmark Capture"
|
log_header "Baseline Benchmark Capture"
|
||||||
log_info "Results will be saved to: $RESULT_DIR"
|
log_info "Results will be saved to: $RESULT_DIR"
|
||||||
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
|
$SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
|
||||||
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
|
(( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
|
||||||
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
|
[[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
|
||||||
|
(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"
|
||||||
|
|
||||||
# ── 1. Save system state ────────────────────────────────
|
# ── 1. Save system state ────────────────────────────────
|
||||||
log_info "Capturing system state..."
|
log_info "Capturing system state..."
|
||||||
@@ -165,9 +179,8 @@ log_info "Metric logger started (PID: $METRICS_PID)"
|
|||||||
cleanup() {
|
cleanup() {
|
||||||
kill "$METRICS_PID" 2>/dev/null || true
|
kill "$METRICS_PID" 2>/dev/null || true
|
||||||
wait "$METRICS_PID" 2>/dev/null || true
|
wait "$METRICS_PID" 2>/dev/null || true
|
||||||
return 0
|
|
||||||
}
|
}
|
||||||
trap cleanup EXIT
|
trap 'cleanup; exit 0' EXIT
|
||||||
|
|
||||||
# ── 5. Run benchmarks ───────────────────────────────────
|
# ── 5. Run benchmarks ───────────────────────────────────
|
||||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||||
@@ -189,15 +202,34 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
for KV_SPEC in "${KV_TYPES[@]}"; do
|
||||||
|
# Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
|
||||||
|
if [[ "$KV_SPEC" == *:* ]]; then
|
||||||
|
KV_K="${KV_SPEC%%:*}"
|
||||||
|
KV_V="${KV_SPEC##*:}"
|
||||||
|
else
|
||||||
|
KV_K="$KV_SPEC"
|
||||||
|
KV_V="$KV_SPEC"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Build KV cache args (skip for f16 — it's the default)
|
||||||
|
KV_ARGS=()
|
||||||
|
KV_SUFFIX=""
|
||||||
|
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
|
||||||
|
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
|
||||||
|
KV_SUFFIX="__kv_${KV_K}_${KV_V}"
|
||||||
|
fi
|
||||||
|
|
||||||
# Standard test
|
# Standard test
|
||||||
local_suffix="fa1"
|
local_suffix="fa1"
|
||||||
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
||||||
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
|
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT" ]]; then
|
if [[ ! -s "$OUT" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
|
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
||||||
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
|
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD[*]}"
|
printf " cmd: %s\n" "${CMD[*]}"
|
||||||
if "${CMD[@]}" > "$OUT" 2>&1; then
|
if "${CMD[@]}" > "$OUT" 2>&1; then
|
||||||
@@ -211,14 +243,15 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
log_info "Skipping standard test (log exists): $OUT"
|
log_info "Skipping standard test (log exists): $OUT"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Long-context test (pp2048, tg32, ctx 32768)
|
# Long-context test
|
||||||
if $SKIP_LONGCTX; then
|
if $SKIP_LONGCTX; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — long-context %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
|
printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
||||||
|
|
||||||
UB_SIZE=2048
|
UB_SIZE=2048
|
||||||
[[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
[[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
@@ -226,7 +259,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
||||||
-r "$REPS_LONGCTX")
|
-r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD_LC[*]}"
|
printf " cmd: %s\n" "${CMD_LC[*]}"
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
@@ -239,6 +272,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
else
|
else
|
||||||
log_info "Skipping long-context test (log exists): $OUT_LC"
|
log_info "Skipping long-context test (log exists): $OUT_LC"
|
||||||
fi
|
fi
|
||||||
|
done # KV_TYPES
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -258,6 +292,10 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
if "FAILED" in content:
|
if "FAILED" in content:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Extract KV cache type from filename (__kv_q8_0_q8_0)
|
||||||
|
# Each KV type is letters+digits plus at most one "_digits"/"_letters" tail (f16, q8_0, iq4_nl), so the K/V split on "_" is unambiguous; two greedy [a-z0-9_]+ groups would parse "q8_0_q4_0" as "q8_0_q4" + "0".
kv_match = re.search(r'__kv_([a-z]+[0-9]+(?:_(?:[0-9]+|[a-z]+))?)_([a-z]+[0-9]+(?:_(?:[0-9]+|[a-z]+))?)\.log$', logfile.name)
|
||||||
|
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
|
||||||
|
|
||||||
for line in content.splitlines():
|
for line in content.splitlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
||||||
@@ -286,6 +324,7 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
"backend": parts[4].strip(),
|
"backend": parts[4].strip(),
|
||||||
"test": test_type,
|
"test": test_type,
|
||||||
"tokens_per_sec": float(ts_match.group(1)),
|
"tokens_per_sec": float(ts_match.group(1)),
|
||||||
|
"kv_cache": kv_type,
|
||||||
"raw": ts_raw,
|
"raw": ts_raw,
|
||||||
})
|
})
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
@@ -307,13 +346,14 @@ if not data["results"]:
|
|||||||
print(" No results parsed. Check log files for errors.")
|
print(" No results parsed. Check log files for errors.")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
fmt = " {:<20} {:<16} {:<8} {:>10}"
|
fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
|
||||||
print(fmt.format("Model", "Backend", "Test", "t/s"))
|
print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
|
||||||
print(" " + "-" * 58)
|
print(" " + "-" * 68)
|
||||||
for r in data["results"]:
|
for r in data["results"]:
|
||||||
print(fmt.format(
|
print(fmt.format(
|
||||||
r["model"][:20],
|
r["model"][:20],
|
||||||
r["backend"][:16],
|
r["backend"][:16],
|
||||||
|
r.get("kv_cache", "f16/f16")[:10],
|
||||||
r["test"],
|
r["test"],
|
||||||
f"{r['tokens_per_sec']:.2f}"
|
f"{r['tokens_per_sec']:.2f}"
|
||||||
))
|
))
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ CTX_DEPTH=32768
|
|||||||
CTX_PROMPT=2048
|
CTX_PROMPT=2048
|
||||||
PP_TOKENS=512
|
PP_TOKENS=512
|
||||||
TG_TOKENS=128
|
TG_TOKENS=128
|
||||||
|
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -33,6 +34,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
--context|-d) CTX_DEPTH="$2"; shift 2 ;;
|
||||||
--pp) PP_TOKENS="$2"; shift 2 ;;
|
--pp) PP_TOKENS="$2"; shift 2 ;;
|
||||||
--tg) TG_TOKENS="$2"; shift 2 ;;
|
--tg) TG_TOKENS="$2"; shift 2 ;;
|
||||||
|
--kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: run-suite.sh [OPTIONS]"
|
echo "Usage: run-suite.sh [OPTIONS]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -47,10 +49,15 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --context N Long-context depth in tokens (default: 32768)"
|
echo " --context N Long-context depth in tokens (default: 32768)"
|
||||||
echo " --pp N Prompt processing tokens (default: 512)"
|
echo " --pp N Prompt processing tokens (default: 512)"
|
||||||
echo " --tg N Token generation count (default: 128)"
|
echo " --tg N Token generation count (default: 128)"
|
||||||
|
echo " --kv-types LIST KV cache sweep: comma-separated types to test"
|
||||||
|
echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
|
||||||
|
echo " Types: f16, q8_0, q4_0, q4_1"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
|
echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
|
||||||
echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
|
echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
|
||||||
|
echo " run-suite.sh --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
|
||||||
|
echo " run-suite.sh --tag kv-mixed --kv-types q8_0,q4_0:q8_0 --context 131072"
|
||||||
echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
|
echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
|
||||||
exit 0 ;;
|
exit 0 ;;
|
||||||
*) log_warn "Unknown argument: $1"; shift ;;
|
*) log_warn "Unknown argument: $1"; shift ;;
|
||||||
@@ -63,12 +70,20 @@ if (( CTX_DEPTH > 32768 )); then
|
|||||||
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
(( CTX_PROMPT < 512 )) && CTX_PROMPT=512
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Parse KV cache types for sweep
|
||||||
|
if [[ -n "$KV_TYPES_RAW" ]]; then
|
||||||
|
IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
|
||||||
|
else
|
||||||
|
KV_TYPES=("f16")
|
||||||
|
fi
|
||||||
|
|
||||||
TS="$(timestamp)"
|
TS="$(timestamp)"
|
||||||
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
|
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
|
||||||
mkdir -p "$RESULT_DIR"
|
mkdir -p "$RESULT_DIR"
|
||||||
|
|
||||||
log_header "Benchmark Suite: $TAG"
|
log_header "Benchmark Suite: $TAG"
|
||||||
log_info "Results: $RESULT_DIR"
|
log_info "Results: $RESULT_DIR"
|
||||||
|
(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"
|
||||||
|
|
||||||
# Save system state
|
# Save system state
|
||||||
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
|
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
|
||||||
@@ -157,7 +172,11 @@ log_info "Models: ${#MODEL_PATHS[@]}"
|
|||||||
METRICS_FILE="$RESULT_DIR/metrics.csv"
|
METRICS_FILE="$RESULT_DIR/metrics.csv"
|
||||||
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
|
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
|
||||||
METRICS_PID=$!
|
METRICS_PID=$!
|
||||||
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null; true' EXIT
|
cleanup() {
|
||||||
|
kill "$METRICS_PID" 2>/dev/null || true
|
||||||
|
wait "$METRICS_PID" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
trap 'cleanup; exit 0' EXIT
|
||||||
|
|
||||||
# Run benchmarks (same logic as run-baseline.sh)
|
# Run benchmarks (same logic as run-baseline.sh)
|
||||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||||
@@ -176,15 +195,34 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
for KV_SPEC in "${KV_TYPES[@]}"; do
|
||||||
|
# Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
|
||||||
|
if [[ "$KV_SPEC" == *:* ]]; then
|
||||||
|
KV_K="${KV_SPEC%%:*}"
|
||||||
|
KV_V="${KV_SPEC##*:}"
|
||||||
|
else
|
||||||
|
KV_K="$KV_SPEC"
|
||||||
|
KV_V="$KV_SPEC"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Build KV cache args (skip for f16 — it's the default)
|
||||||
|
KV_ARGS=()
|
||||||
|
KV_SUFFIX=""
|
||||||
|
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
|
||||||
|
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
|
||||||
|
KV_SUFFIX="__kv_${KV_K}_${KV_V}"
|
||||||
|
fi
|
||||||
|
|
||||||
# Standard test
|
# Standard test
|
||||||
local_suffix="fa1"
|
local_suffix="fa1"
|
||||||
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
[[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
|
||||||
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
|
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT" ]]; then
|
if [[ ! -s "$OUT" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
|
printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
|
||||||
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
|
-p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
|
||||||
if "${CMD[@]}" > "$OUT" 2>&1; then
|
if "${CMD[@]}" > "$OUT" 2>&1; then
|
||||||
log_success "Done"; tail -3 "$OUT"
|
log_success "Done"; tail -3 "$OUT"
|
||||||
else
|
else
|
||||||
@@ -196,19 +234,21 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
if $SKIP_LONGCTX; then
|
if $SKIP_LONGCTX; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
|
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
|
||||||
if [[ ! -s "$OUT_LC" ]]; then
|
if [[ ! -s "$OUT_LC" ]]; then
|
||||||
printf "\n${BOLD}>> [%s] %s — longctx %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
|
printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
|
||||||
|
"$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
|
||||||
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX")
|
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
log_success "Done"; tail -3 "$OUT_LC"
|
log_success "Done"; tail -3 "$OUT_LC"
|
||||||
else
|
else
|
||||||
log_error "Failed"; echo "FAILED" >> "$OUT_LC"
|
log_error "Failed"; echo "FAILED" >> "$OUT_LC"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
done # KV_TYPES
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -226,6 +266,11 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
content = logfile.read_text()
|
content = logfile.read_text()
|
||||||
if "FAILED" in content:
|
if "FAILED" in content:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Extract KV cache type from filename (__kv_q8_0_q8_0)
|
||||||
|
# Each KV type is letters+digits plus at most one "_digits"/"_letters" tail (f16, q8_0, iq4_nl), so the K/V split on "_" is unambiguous; two greedy [a-z0-9_]+ groups would parse "q8_0_q4_0" as "q8_0_q4" + "0".
kv_match = re.search(r'__kv_([a-z]+[0-9]+(?:_(?:[0-9]+|[a-z]+))?)_([a-z]+[0-9]+(?:_(?:[0-9]+|[a-z]+))?)\.log$', logfile.name)
|
||||||
|
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
|
||||||
|
|
||||||
for line in content.splitlines():
|
for line in content.splitlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
|
||||||
@@ -248,6 +293,7 @@ for logfile in sorted(result_dir.glob("*.log")):
|
|||||||
"backend": parts[4].strip(),
|
"backend": parts[4].strip(),
|
||||||
"test": test_type,
|
"test": test_type,
|
||||||
"tokens_per_sec": float(ts_match.group(1)),
|
"tokens_per_sec": float(ts_match.group(1)),
|
||||||
|
"kv_cache": kv_type,
|
||||||
"raw": ts_raw,
|
"raw": ts_raw,
|
||||||
})
|
})
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
@@ -264,11 +310,14 @@ with open(sys.argv[1]) as f:
|
|||||||
if not data["results"]:
|
if not data["results"]:
|
||||||
print(" No results parsed.")
|
print(" No results parsed.")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
fmt = " {:<20} {:<16} {:<8} {:>10}"
|
fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
|
||||||
print(fmt.format("Model", "Backend", "Test", "t/s"))
|
print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
|
||||||
print(" " + "-" * 58)
|
print(" " + "-" * 68)
|
||||||
for r in data["results"]:
|
for r in data["results"]:
|
||||||
print(fmt.format(r["model"][:20], r["backend"][:16], r["test"], f"{r['tokens_per_sec']:.2f}"))
|
print(fmt.format(
|
||||||
|
r["model"][:20], r["backend"][:16],
|
||||||
|
r.get("kv_cache", "f16/f16")[:10], r["test"],
|
||||||
|
f"{r['tokens_per_sec']:.2f}"))
|
||||||
PYEOF
|
PYEOF
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--max-size"
|
assert_output --partial "--max-size"
|
||||||
assert_output --partial "--category"
|
assert_output --partial "--category"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
|
assert_output --partial "--kv-types"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "run-suite --help shows usage and exits 0" {
|
@test "run-suite --help shows usage and exits 0" {
|
||||||
@@ -20,6 +21,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--category"
|
assert_output --partial "--category"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--tag"
|
assert_output --partial "--tag"
|
||||||
|
assert_output --partial "--kv-types"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher shows help with no args" {
|
@test "benchmark dispatcher shows help with no args" {
|
||||||
@@ -28,6 +30,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "Commands"
|
assert_output --partial "Commands"
|
||||||
assert_output --partial "--max-size"
|
assert_output --partial "--max-size"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
|
assert_output --partial "--kv-types"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher passes --help through to baseline" {
|
@test "benchmark dispatcher passes --help through to baseline" {
|
||||||
|
|||||||
Reference in New Issue
Block a user