benchmark: add --kv-types to sweep KV cache quantization types

Adds a --kv-types LIST option to bin/benchmark (help text only),
scripts/benchmark/run-baseline.sh and scripts/benchmark/run-suite.sh.
Each comma-separated entry is either TYPE (used for both K and V) or
K_TYPE:V_TYPE. For every entry the standard and long-context tests are
run once; entries other than f16/f16 pass "-ctk K -ctv V" to the bench
binary and append "__kv_<K>-<V>" to the log file name. The embedded
summary parser reads that suffix back into a "kv_cache" field (logs
without the suffix are reported as f16/f16) and prints it as a new column.

Review changes relative to the submitted patch:

  * The K/V separator in the log suffix is "-" instead of "_". Type names
    contain "_" themselves, so the submitted greedy pattern
    __kv_([a-z0-9_]+)_([a-z0-9_]+) split "__kv_q8_0_q8_0.log" into
    "q8_0_q8" and "0", mislabelling every swept result.
  * The EXIT trap only runs cleanup. The submitted "cleanup; exit 0"
    replaced the exit status of every exit path (errors, interrupts)
    with 0; bash keeps the original status when the trap does not call exit.

NOTE(review): this copy was rebuilt from a whitespace-collapsed diff.
Indentation and blank context lines are reconstructed from the hunk
counts and the "index" lines are carried over unchanged, so verify the
context against the tree (or apply with --ignore-whitespace).

diff --git a/bin/benchmark b/bin/benchmark
index 4bdc124..6df7533 100755
--- a/bin/benchmark
+++ b/bin/benchmark
@@ -23,10 +23,12 @@ case "${1:-help}" in
         echo " --category LIST Comma-separated: smoke,dense,moe"
         echo " --skip-longctx Skip long-context (32K) tests"
         echo " --reps N Standard test repetitions (default: 5)"
+        echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
         echo ""
         echo "Examples:"
         echo " benchmark baseline --max-size 20 --skip-longctx"
         echo " benchmark run --tag post-opt --category moe"
+        echo " benchmark run --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
         exit 1
         ;;
 esac
diff --git a/scripts/benchmark/run-baseline.sh b/scripts/benchmark/run-baseline.sh
index c5104be..700766b 100644
--- a/scripts/benchmark/run-baseline.sh
+++ b/scripts/benchmark/run-baseline.sh
@@ -21,6 +21,7 @@ CTX_DEPTH=32768
 CTX_PROMPT=2048
 PP_TOKENS=512
 TG_TOKENS=128
+KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -31,6 +32,7 @@ while [[ $# -gt 0 ]]; do
         --context|-d) CTX_DEPTH="$2"; shift 2 ;;
         --pp) PP_TOKENS="$2"; shift 2 ;;
         --tg) TG_TOKENS="$2"; shift 2 ;;
+        --kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
         --help|-h)
             echo "Usage: run-baseline.sh [OPTIONS]"
             echo ""
@@ -42,11 +44,15 @@ while [[ $# -gt 0 ]]; do
             echo " --context N Long-context depth in tokens (default: 32768)"
             echo " --pp N Prompt processing tokens (default: 512)"
             echo " --tg N Token generation count (default: 128)"
+            echo " --kv-types LIST KV cache sweep: comma-separated types to test"
+            echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
+            echo " Types: f16, q8_0, q4_0, q4_1"
             echo ""
             echo "Examples:"
             echo " run-baseline.sh --max-size 20 # Only models ≤20 GB"
             echo " run-baseline.sh --context 131072 --category moe # 128K context on MoE"
             echo " run-baseline.sh --tg 1024 --pp 2048 --category moe # Realistic agentic"
+            echo " run-baseline.sh --kv-types f16,q8_0,q4_0 --context 131072 # KV sweep"
             echo " run-baseline.sh --skip-longctx --max-size 15 # Quick safe run"
             exit 0 ;;
         *) log_warn "Unknown argument: $1"; shift ;;
@@ -59,11 +65,19 @@ if (( CTX_DEPTH > 32768 )); then
     (( CTX_PROMPT < 512 )) && CTX_PROMPT=512
 fi
 
+# Parse KV cache types for sweep (no --kv-types means a single f16/f16 pass)
+if [[ -n "$KV_TYPES_RAW" ]]; then
+    IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
+else
+    KV_TYPES=("f16")
+fi
+
 log_header "Baseline Benchmark Capture"
 log_info "Results will be saved to: $RESULT_DIR"
 $SKIP_LONGCTX && log_info "Long-context tests: SKIPPED"
 (( MAX_SIZE_GB > 0 )) && log_info "Max model size: ${MAX_SIZE_GB} GB"
 [[ -n "$CATEGORY_FILTER" ]] && log_info "Categories: $CATEGORY_FILTER"
+(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"
 
 # ── 1. Save system state ────────────────────────────────
 log_info "Capturing system state..."
@@ -165,9 +179,8 @@ log_info "Metric logger started (PID: $METRICS_PID)"
 cleanup() {
     kill "$METRICS_PID" 2>/dev/null || true
     wait "$METRICS_PID" 2>/dev/null || true
-    return 0
 }
 trap cleanup EXIT
 
 # ── 5. Run benchmarks ───────────────────────────────────
 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
@@ -189,56 +202,77 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
             TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
         fi
 
-        # Standard test
-        local_suffix="fa1"
-        [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
-        OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
-        if [[ ! -s "$OUT" ]]; then
-            printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
-            CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-                -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
-
-            printf " cmd: %s\n" "${CMD[*]}"
-            if "${CMD[@]}" > "$OUT" 2>&1; then
-                log_success "Standard test complete"
-                tail -5 "$OUT"
+        for KV_SPEC in "${KV_TYPES[@]}"; do
+            # Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
+            if [[ "$KV_SPEC" == *:* ]]; then
+                KV_K="${KV_SPEC%%:*}"
+                KV_V="${KV_SPEC##*:}"
             else
-                log_error "Standard test failed (exit $?)"
-                echo "FAILED" >> "$OUT"
+                KV_K="$KV_SPEC"
+                KV_V="$KV_SPEC"
             fi
-        else
-            log_info "Skipping standard test (log exists): $OUT"
-        fi
 
-        # Long-context test (pp2048, tg32, ctx 32768)
-        if $SKIP_LONGCTX; then
-            continue
-        fi
+            # Build KV cache args (skip for f16 — it's the default)
+            KV_ARGS=()
+            KV_SUFFIX=""
+            if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
+                KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
+                KV_SUFFIX="__kv_${KV_K}-${KV_V}" # "-" keeps K and V separable: type names contain "_"
+            fi
 
-        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
-        if [[ ! -s "$OUT_LC" ]]; then
-            printf "\n${BOLD}>> [%s] %s — long-context %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
+            # Standard test
+            local_suffix="fa1"
+            [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
+            OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
+            if [[ ! -s "$OUT" ]]; then
+                printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
+                    "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
+                CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
+                    -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
+                    -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
 
-            UB_SIZE=2048
-            [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
-
-            CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-                -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
-                -r "$REPS_LONGCTX")
-
-            printf " cmd: %s\n" "${CMD_LC[*]}"
-            if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
-                log_success "Long-context test complete"
-                tail -5 "$OUT_LC"
+                printf " cmd: %s\n" "${CMD[*]}"
+                if "${CMD[@]}" > "$OUT" 2>&1; then
+                    log_success "Standard test complete"
+                    tail -5 "$OUT"
+                else
+                    log_error "Standard test failed (exit $?)"
+                    echo "FAILED" >> "$OUT"
+                fi
             else
-                log_error "Long-context test failed (exit $?)"
-                echo "FAILED" >> "$OUT_LC"
+                log_info "Skipping standard test (log exists): $OUT"
             fi
-        else
-            log_info "Skipping long-context test (log exists): $OUT_LC"
-        fi
+
+            # Long-context test
+            if $SKIP_LONGCTX; then
+                continue
+            fi
+
+            OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
+            if [[ ! -s "$OUT_LC" ]]; then
+                printf "\n${BOLD}>> [%s] %s — long-context %s KV=%s${RESET}\n" \
+                    "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
+
+                UB_SIZE=2048
+                [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
+
+                CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
+                    -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
+                    -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
+                    -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
+
+                printf " cmd: %s\n" "${CMD_LC[*]}"
+                if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
+                    log_success "Long-context test complete"
+                    tail -5 "$OUT_LC"
+                else
+                    log_error "Long-context test failed (exit $?)"
+                    echo "FAILED" >> "$OUT_LC"
+                fi
+            else
+                log_info "Skipping long-context test (log exists): $OUT_LC"
+            fi
+        done # KV_TYPES
     done
 done
 
@@ -258,6 +292,10 @@ for logfile in sorted(result_dir.glob("*.log")):
     if "FAILED" in content:
         continue
 
+    # Extract KV cache type from filename (__kv_q8_0-q8_0); no suffix means the f16 default
+    kv_match = re.search(r'__kv_([A-Za-z0-9_]+)-([A-Za-z0-9_]+)\.log$', logfile.name)
+    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
+
     for line in content.splitlines():
         line = line.strip()
         if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
@@ -286,6 +324,7 @@ for logfile in sorted(result_dir.glob("*.log")):
                     "backend": parts[4].strip(),
                     "test": test_type,
                     "tokens_per_sec": float(ts_match.group(1)),
+                    "kv_cache": kv_type,
                     "raw": ts_raw,
                 })
             except (ValueError, IndexError):
@@ -307,13 +346,14 @@ if not data["results"]:
     print(" No results parsed. Check log files for errors.")
     sys.exit(0)
 
-fmt = " {:<20} {:<16} {:<8} {:>10}"
-print(fmt.format("Model", "Backend", "Test", "t/s"))
-print(" " + "-" * 58)
+fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
+print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
+print(" " + "-" * 68)
 for r in data["results"]:
     print(fmt.format(
         r["model"][:20],
         r["backend"][:16],
+        r.get("kv_cache", "f16/f16")[:10],
         r["test"],
         f"{r['tokens_per_sec']:.2f}"
     ))
diff --git a/scripts/benchmark/run-suite.sh b/scripts/benchmark/run-suite.sh
index 7049298..e342698 100644
--- a/scripts/benchmark/run-suite.sh
+++ b/scripts/benchmark/run-suite.sh
@@ -20,6 +20,7 @@ CTX_DEPTH=32768
 CTX_PROMPT=2048
 PP_TOKENS=512
 TG_TOKENS=128
+KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -33,6 +34,7 @@ while [[ $# -gt 0 ]]; do
         --context|-d) CTX_DEPTH="$2"; shift 2 ;;
         --pp) PP_TOKENS="$2"; shift 2 ;;
         --tg) TG_TOKENS="$2"; shift 2 ;;
+        --kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
         --help|-h)
             echo "Usage: run-suite.sh [OPTIONS]"
             echo ""
@@ -47,10 +49,15 @@ while [[ $# -gt 0 ]]; do
             echo " --context N Long-context depth in tokens (default: 32768)"
             echo " --pp N Prompt processing tokens (default: 512)"
             echo " --tg N Token generation count (default: 128)"
+            echo " --kv-types LIST KV cache sweep: comma-separated types to test"
+            echo " Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
+            echo " Types: f16, q8_0, q4_0, q4_1"
             echo ""
             echo "Examples:"
             echo " run-suite.sh --tag ctx128k --context 131072 --category moe"
             echo " run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
+            echo " run-suite.sh --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
+            echo " run-suite.sh --tag kv-mixed --kv-types q8_0,q4_0:q8_0 --context 131072"
             echo " run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
             exit 0 ;;
         *) log_warn "Unknown argument: $1"; shift ;;
@@ -63,12 +70,20 @@ if (( CTX_DEPTH > 32768 )); then
     (( CTX_PROMPT < 512 )) && CTX_PROMPT=512
 fi
 
+# Parse KV cache types for sweep (no --kv-types means a single f16/f16 pass)
+if [[ -n "$KV_TYPES_RAW" ]]; then
+    IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
+else
+    KV_TYPES=("f16")
+fi
+
 TS="$(timestamp)"
 RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
 mkdir -p "$RESULT_DIR"
 
 log_header "Benchmark Suite: $TAG"
 log_info "Results: $RESULT_DIR"
+(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"
 
 # Save system state
 bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null
@@ -157,7 +172,11 @@ log_info "Models: ${#MODEL_PATHS[@]}"
 METRICS_FILE="$RESULT_DIR/metrics.csv"
 bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
 METRICS_PID=$!
-trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null; true' EXIT
+cleanup() {
+    kill "$METRICS_PID" 2>/dev/null || true
+    wait "$METRICS_PID" 2>/dev/null || true
+}
+trap cleanup EXIT
 
 # Run benchmarks (same logic as run-baseline.sh)
 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
@@ -176,39 +195,60 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
             TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
         fi
 
-        # Standard test
-        local_suffix="fa1"
-        [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
-        OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}.log"
-        if [[ ! -s "$OUT" ]]; then
-            printf "\n${BOLD}>> [%s] %s — pp%s/tg%s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS"
-            CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-                -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD")
-            if "${CMD[@]}" > "$OUT" 2>&1; then
-                log_success "Done"; tail -3 "$OUT"
+        for KV_SPEC in "${KV_TYPES[@]}"; do
+            # Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
+            if [[ "$KV_SPEC" == *:* ]]; then
+                KV_K="${KV_SPEC%%:*}"
+                KV_V="${KV_SPEC##*:}"
             else
-                log_error "Failed"; echo "FAILED" >> "$OUT"
+                KV_K="$KV_SPEC"
+                KV_V="$KV_SPEC"
             fi
-        fi
 
-        # Long-context test
-        if $SKIP_LONGCTX; then
-            continue
-        fi
-        OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}.log"
-        if [[ ! -s "$OUT_LC" ]]; then
-            printf "\n${BOLD}>> [%s] %s — longctx %s${RESET}\n" "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH"
-            UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
-            CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-                -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-                -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX")
-            if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
-                log_success "Done"; tail -3 "$OUT_LC"
-            else
-                log_error "Failed"; echo "FAILED" >> "$OUT_LC"
+            # Build KV cache args (skip for f16 — it's the default)
+            KV_ARGS=()
+            KV_SUFFIX=""
+            if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
+                KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
+                KV_SUFFIX="__kv_${KV_K}-${KV_V}" # "-" keeps K and V separable: type names contain "_"
             fi
-        fi
+
+            # Standard test
+            local_suffix="fa1"
+            [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
+            OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}.log"
+            if [[ ! -s "$OUT" ]]; then
+                printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
+                    "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
+                CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
+                    -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
+                    -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${KV_ARGS[@]}")
+                if "${CMD[@]}" > "$OUT" 2>&1; then
+                    log_success "Done"; tail -3 "$OUT"
+                else
+                    log_error "Failed"; echo "FAILED" >> "$OUT"
+                fi
+            fi
+
+            # Long-context test
+            if $SKIP_LONGCTX; then
+                continue
+            fi
+            OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}.log"
+            if [[ ! -s "$OUT_LC" ]]; then
+                printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
+                    "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
+                UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
+                CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
+                    -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
+                    -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
+                if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
+                    log_success "Done"; tail -3 "$OUT_LC"
+                else
+                    log_error "Failed"; echo "FAILED" >> "$OUT_LC"
+                fi
+            fi
+        done # KV_TYPES
     done
 done
 
@@ -226,6 +266,11 @@ for logfile in sorted(result_dir.glob("*.log")):
     content = logfile.read_text()
     if "FAILED" in content:
         continue
+
+    # Extract KV cache type from filename (__kv_q8_0-q8_0); no suffix means the f16 default
+    kv_match = re.search(r'__kv_([A-Za-z0-9_]+)-([A-Za-z0-9_]+)\.log$', logfile.name)
+    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
+
     for line in content.splitlines():
         line = line.strip()
         if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
@@ -248,6 +293,7 @@ for logfile in sorted(result_dir.glob("*.log")):
                     "backend": parts[4].strip(),
                     "test": test_type,
                     "tokens_per_sec": float(ts_match.group(1)),
+                    "kv_cache": kv_type,
                     "raw": ts_raw,
                 })
             except (ValueError, IndexError):
@@ -264,11 +310,14 @@ with open(sys.argv[1]) as f:
 if not data["results"]:
     print(" No results parsed.")
     sys.exit(0)
-fmt = " {:<20} {:<16} {:<8} {:>10}"
-print(fmt.format("Model", "Backend", "Test", "t/s"))
-print(" " + "-" * 58)
+fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
+print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
+print(" " + "-" * 68)
 for r in data["results"]:
-    print(fmt.format(r["model"][:20], r["backend"][:16], r["test"], f"{r['tokens_per_sec']:.2f}"))
+    print(fmt.format(
+        r["model"][:20], r["backend"][:16],
+        r.get("kv_cache", "f16/f16")[:10], r["test"],
+        f"{r['tokens_per_sec']:.2f}"))
 PYEOF
 
 echo ""
diff --git a/tests/benchmark_flags.bats b/tests/benchmark_flags.bats
index 923a363..9bc0f2d 100644
--- a/tests/benchmark_flags.bats
+++ b/tests/benchmark_flags.bats
@@ -10,6 +10,7 @@ load test_helper.sh
     assert_output --partial "--max-size"
     assert_output --partial "--category"
     assert_output --partial "--skip-longctx"
+    assert_output --partial "--kv-types"
 }
 
 @test "run-suite --help shows usage and exits 0" {
@@ -20,6 +21,7 @@ load test_helper.sh
     assert_output --partial "--category"
     assert_output --partial "--skip-longctx"
     assert_output --partial "--tag"
+    assert_output --partial "--kv-types"
 }
 
 @test "benchmark dispatcher shows help with no args" {
@@ -28,6 +30,7 @@ load test_helper.sh
     assert_output --partial "Commands"
     assert_output --partial "--max-size"
     assert_output --partial "--skip-longctx"
+    assert_output --partial "--kv-types"
 }
 
 @test "benchmark dispatcher passes --help through to baseline" {