Serving: - make serve now launches Claude-distilled APEX 35B-A3B (16GB) with 2 parallel slots and 256K context as the daily driver - add serve-custom for ad-hoc model testing - add flush-gpu to reclaim unified memory after stuck runs Benchmarks: - default Vulkan-only backends (ROCm trails at long context) - add --backends filter to run-baseline.sh - fix backend filter substring bug (grep -qFx for exact line match) - fix model filter regex metacharacter bug (grep -qiF for literal) - respect --tg in long-context tests instead of hardcoded n=32 ROCm bump to 7.2.1 (kernel 6.18.4+ patch); keep 7.2 as optional. Catalog: - add mudler APEX I-Compact (Claude-distilled 35B, 17GB) - add 0xSero REAP-40 (pruned 122B-A10B, 46GB) - update download instructions: hf download (huggingface-cli is gone)
340 lines
12 KiB
Bash
340 lines
12 KiB
Bash
#!/usr/bin/env bash
# Full benchmark suite — run all backends × models with tagging
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Shared helper libraries — presumably provide data_dir, timestamp, log_*,
# detect_toolbox_names, and the BOLD/RESET formatting vars used below.
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

MODEL_DIR="$(data_dir models)"

# Defaults; each is overridable via the CLI flags parsed below.
TAG="run"                             # run label; becomes part of the results dir name
BACKENDS_FILTER="llama-vulkan-radv"   # comma-separated toolbox names; empty = all detected
MODELS_FILTER=""                      # comma-separated model filename filter; empty = all models
SKIP_LONGCTX=false                    # --skip-longctx disables the long-context pass
MAX_SIZE_GB=0                         # 0 = no file-size limit
CATEGORY_FILTER=""                    # comma-separated categories from models.conf
REPS_STANDARD=5                       # repetitions for the standard test
REPS_LONGCTX=3                        # repetitions for the long-context test
CTX_DEPTH=32768                       # long-context depth in tokens
CTX_PROMPT=2048                       # long-context prompt tokens (rescaled below for deeper contexts)
PP_TOKENS=512                         # prompt-processing tokens (standard test)
TG_TOKENS=128                         # token-generation count (standard test)
BATCH_SIZE="" # Batch size override (-b flag, empty = llama-bench default 2048)
KV_TYPES_RAW="" # Comma-separated KV cache types to sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)
# --- CLI argument parsing ---------------------------------------------------
# BUGFIX: "-b" used to appear in two case arms: "--backends|-b" (first, so it
# always won) and "-b|--batch" (unreachable for "-b").  The help text documents
# "-b, --batch", so "-b" now selects the batch size; --backends has no short
# flag.  Note: --reps only sets REPS_STANDARD; REPS_LONGCTX stays at its default.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --tag|-t) TAG="$2"; shift 2 ;;
    --backends) BACKENDS_FILTER="$2"; shift 2 ;;
    --models|-m) MODELS_FILTER="$2"; shift 2 ;;
    --skip-longctx) SKIP_LONGCTX=true; shift ;;
    --max-size|-s) MAX_SIZE_GB="$2"; shift 2 ;;
    --category|-c) CATEGORY_FILTER="$2"; shift 2 ;;
    --reps|-r) REPS_STANDARD="$2"; shift 2 ;;
    --context|-d) CTX_DEPTH="$2"; shift 2 ;;
    --pp) PP_TOKENS="$2"; shift 2 ;;
    --tg) TG_TOKENS="$2"; shift 2 ;;
    -b|--batch) BATCH_SIZE="$2"; shift 2 ;;
    --kv-types) KV_TYPES_RAW="$2"; shift 2 ;;
    --help|-h)
      echo "Usage: run-suite.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  --tag NAME        Tag this run (default: run)"
      echo "  --backends LIST   Comma-separated backend filter"
      echo "  --models LIST     Comma-separated model filename filter"
      echo "  --skip-longctx    Skip long-context tests"
      echo "  --max-size GB     Only bench models up to this file size in GB"
      echo "  --category LIST   Comma-separated: smoke,dense,moe (from models.conf)"
      echo "  --reps N          Standard test repetitions (default: 5)"
      echo "  --context N       Long-context depth in tokens (default: 32768)"
      echo "  --pp N            Prompt processing tokens (default: 512)"
      echo "  --tg N            Token generation count (default: 128)"
      echo "  -b, --batch N     Batch size (default: 2048, try 256 for MoE)"
      echo "  --kv-types LIST   KV cache sweep: comma-separated types to test"
      echo "                    Each entry: TYPE (both K+V) or K_TYPE:V_TYPE"
      echo "                    Types: f16, q8_0, q4_0, q4_1"
      echo ""
      echo "Examples:"
      echo "  run-suite.sh --tag ctx128k --context 131072 --category moe"
      echo "  run-suite.sh --tag realistic --tg 1024 --pp 2048 --category moe"
      echo "  run-suite.sh --tag kv-sweep --kv-types f16,q8_0,q4_0 --context 131072"
      echo "  run-suite.sh --tag kv-mixed --kv-types q8_0,q4_0:q8_0 --context 131072"
      echo "  run-suite.sh --tag post-opt --max-size 20 --skip-longctx"
      exit 0 ;;
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done
# Scale prompt tokens for long-context runs: ~1/16 of the context depth, with a
# 512-token floor.  Depths up to 32K keep the 2048 default unchanged.
if (( CTX_DEPTH > 32768 )); then
  CTX_PROMPT=$(( CTX_DEPTH / 16 ))
  (( CTX_PROMPT < 512 )) && CTX_PROMPT=512
fi

# Parse the KV cache sweep list; with no --kv-types, do a single f16 pass
# (f16 is llama-bench's default, so no -ctk/-ctv flags get emitted for it).
if [[ -n "$KV_TYPES_RAW" ]]; then
  IFS=',' read -ra KV_TYPES <<< "$KV_TYPES_RAW"
else
  KV_TYPES=("f16")
fi
# Create a fresh, timestamped results directory for this tagged run.
TS="$(timestamp)"
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
mkdir -p "$RESULT_DIR"

log_header "Benchmark Suite: $TAG"
log_info "Results: $RESULT_DIR"
(( ${#KV_TYPES[@]} > 1 )) && log_info "KV cache sweep: ${KV_TYPES[*]}"

# Snapshot system state next to the results for later comparison.  Best-effort:
# "|| true" keeps a failing report from aborting the whole suite under set -e.
bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null || true

# Discover installed backend toolbox containers (one name per line).
existing="$(detect_toolbox_names 2>/dev/null || true)"
# Map each known backend toolbox to the llama-bench binary path inside it.
declare -A BENCH_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-bench"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2.1]="/usr/local/bin/llama-bench"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)

# Keep only backends that are both installed and selected by --backends.
# grep -qFx everywhere: exact literal whole-line match, so dots in names like
# "llama-rocm-7.2.1" are not treated as regex metacharacters.
available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
  if echo "$existing" | grep -qFx "$tb"; then
    if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -qFx "$tb"; then
      available_backends+=("$tb")
    fi
  fi
done

if (( ${#available_backends[@]} == 0 )); then
  log_error "No matching backends. Run: make benchmark-setup"
  exit 1
fi
log_info "Backends: ${available_backends[*]}"
# Find candidate GGUF models, following symlinks.  Multimodal projector files
# (mmproj-*) are excluded; for sharded models only the first shard is kept
# (llama-bench is expected to pick up the remaining parts itself).
mapfile -t ALL_MODEL_PATHS < <(
  find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)
# Apply --models / --max-size / --category filters to the discovered models.
MODEL_PATHS=()
for p in "${ALL_MODEL_PATHS[@]}"; do
  local_name="$(basename "$p")"

  # Name filter: case-insensitive literal match (-iF) of the full filename
  # against each comma-separated filter entry, so dots in filenames are not
  # regex metacharacters.
  if [[ -n "$MODELS_FILTER" ]]; then
    if ! echo "$MODELS_FILTER" | tr ',' '\n' | grep -qiF "$local_name"; then
      continue
    fi
  fi

  # Size filter: integer GB (truncated); files at or above the limit are skipped.
  if (( MAX_SIZE_GB > 0 )); then
    file_size_gb=$(( $(stat -Lc%s "$p" 2>/dev/null || echo 0) / 1073741824 ))
    if (( file_size_gb >= MAX_SIZE_GB )); then
      log_info "Skipping $local_name (${file_size_gb} GB >= ${MAX_SIZE_GB} GB limit)"
      continue
    fi
  fi

  # Category filter: look the file up in models.conf (fields: name|repo|file|
  # size_gb|category|desc).  Models absent from the catalog pass through.
  if [[ -n "$CATEGORY_FILTER" ]] && [[ -f "$PROJECT_ROOT/configs/models.conf" ]]; then
    matched=false
    found_in_catalog=false
    while IFS='|' read -r name repo file size_gb category desc; do
      [[ "$name" =~ ^#.*$ || -z "$name" ]] && continue   # skip comments/blanks
      if [[ "$local_name" == "$file" ]]; then
        found_in_catalog=true
        echo "$CATEGORY_FILTER" | tr ',' '\n' | grep -qF "$category" && matched=true
        break
      fi
    done < "$PROJECT_ROOT/configs/models.conf"
    if $found_in_catalog && ! $matched; then
      log_info "Skipping $local_name (category not in: $CATEGORY_FILTER)"
      continue
    fi
  fi

  MODEL_PATHS+=("$p")
done

if (( ${#MODEL_PATHS[@]} == 0 )); then
  log_error "No models matched filters. Run: make benchmark-setup"
  exit 1
fi
log_info "Models: ${#MODEL_PATHS[@]}"
# Start background metric logging for the duration of the suite.
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!

# Stop the metrics logger on any exit path (normal end, set -e failure, signal).
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
# BUGFIX: the trap used to be 'cleanup; exit 0', which forced an exit status of
# 0 on every path and masked real failures from callers (make, CI).  A plain
# cleanup trap preserves the script's actual exit status.
trap cleanup EXIT
# Run benchmarks (same logic as run-baseline.sh):
# for every model × backend × KV spec, run a standard pp/tg test and (unless
# --skip-longctx) a long-context test.  A non-empty log file marks a test as
# already done, so an interrupted suite can be resumed into the same directory.
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

  for BACKEND in "${available_backends[@]}"; do
    BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
    BACKEND_SAFE="${BACKEND//[.-]/_}"   # dots/dashes replaced for use in filenames

    ENV_ARGS=()
    [[ "$BACKEND" == *rocm* ]] && ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)

    # Resolve the model path as seen from inside the toolbox container —
    # paths outside /home are assumed visible only under /run/host.
    TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
    if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then
      TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
    fi

    for KV_SPEC in "${KV_TYPES[@]}"; do
      # Parse KV spec: "q8_0" → K=q8_0,V=q8_0 or "q4_0:q8_0" → K=q4_0,V=q8_0
      if [[ "$KV_SPEC" == *:* ]]; then
        KV_K="${KV_SPEC%%:*}"
        KV_V="${KV_SPEC##*:}"
      else
        KV_K="$KV_SPEC"
        KV_V="$KV_SPEC"
      fi

      # Build KV cache args (skip for f16 — it's the default)
      KV_ARGS=()
      KV_SUFFIX=""
      if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
        KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
        KV_SUFFIX="__kv_${KV_K}-${KV_V}"
      fi

      # Build batch size args
      BATCH_ARGS=()
      BATCH_SUFFIX=""
      if [[ -n "$BATCH_SIZE" ]]; then
        BATCH_ARGS+=(-b "$BATCH_SIZE")
        BATCH_SUFFIX="__b${BATCH_SIZE}"
      fi

      # --- Standard test (pp/tg at zero depth) ---
      local_suffix="fa1"
      [[ "$PP_TOKENS" != "512" || "$TG_TOKENS" != "128" ]] && local_suffix="fa1__pp${PP_TOKENS}_tg${TG_TOKENS}"
      OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__${local_suffix}${KV_SUFFIX}${BATCH_SUFFIX}.log"
      if [[ ! -s "$OUT" ]]; then
        printf "\n${BOLD}>> [%s] %s — pp%s/tg%s KV=%s${RESET}\n" \
          "$BACKEND" "$MODEL_NAME" "$PP_TOKENS" "$TG_TOKENS" "${KV_K}/${KV_V}"
        CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
          -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
          -p "$PP_TOKENS" -n "$TG_TOKENS" -r "$REPS_STANDARD" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
        if "${CMD[@]}" > "$OUT" 2>&1; then
          log_success "Done"; tail -3 "$OUT"
        else
          # FAILED marker makes the parser skip this log and keeps the file
          # non-empty so resumed runs don't retry it forever.
          log_error "Failed"; echo "FAILED" >> "$OUT"
        fi
      fi

      # --- Long-context test ---
      if $SKIP_LONGCTX; then
        continue
      fi
      OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx${CTX_DEPTH}${KV_SUFFIX}${BATCH_SUFFIX}.log"
      if [[ ! -s "$OUT_LC" ]]; then
        printf "\n${BOLD}>> [%s] %s — longctx %s KV=%s${RESET}\n" \
          "$BACKEND" "$MODEL_NAME" "$CTX_DEPTH" "${KV_K}/${KV_V}"
        # Vulkan backends get a smaller micro-batch at depth — presumably a
        # stability/memory workaround; confirm against run-baseline.sh.
        UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
        CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
          -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
          -p "$CTX_PROMPT" -n "$TG_TOKENS" -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
        if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
          log_success "Done"; tail -3 "$OUT_LC"
        else
          log_error "Failed"; echo "FAILED" >> "$OUT_LC"
        fi
      fi
    done # KV_TYPES
  done
done
# Parse results: scan every llama-bench log's markdown table into summary.json.
SUMMARY="$RESULT_DIR/summary.json"
python3 - "$RESULT_DIR" > "$SUMMARY" << 'PYEOF'
import sys, re, json
from pathlib import Path

result_dir = Path(sys.argv[1])
results = []

for logfile in sorted(result_dir.glob("*.log")):
    content = logfile.read_text()
    if "FAILED" in content:
        continue

    # Extract KV cache type from the filename (__kv_q8_0-q8_0).  The type
    # tokens are matched explicitly (f16 or q<N>_<N>) — the previous greedy
    # r'__kv_(.+)-(.+)\.log$' swallowed a trailing batch suffix ("__b256")
    # into the V type when --kv-types and --batch were combined.
    kv_match = re.search(r'__kv_(f16|q\d+_\d+)-(f16|q\d+_\d+)', logfile.name)
    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"

    for line in content.splitlines():
        line = line.strip()
        # Only markdown table rows matter; skip the header and separator rows.
        if not line.startswith("|") or ("model" in line.lower() and "size" in line.lower()):
            continue
        if "---" in line:
            continue
        parts = [p.strip() for p in line.split("|")]
        # Filter out empty parts from leading/trailing pipes
        data = [p for p in parts if p and "---" not in p]
        if len(data) < 6:
            continue
        try:
            # test and t/s are always the last two columns; the fixed indices
            # below assume llama-bench's usual column order
            # (model | size | params | backend | ...) — confirm per build.
            test_type = data[-2]
            ts_raw = data[-1]
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
            results.append({
                "file": logfile.name,
                "model": data[0],
                "size": data[1],
                "backend": data[3],
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "kv_cache": kv_type,
                "raw": ts_raw,
            })
        except (ValueError, IndexError):
            continue

print(json.dumps({"results": results}, indent=2))
PYEOF
log_header "Results"
# Pretty-print the parsed summary as an aligned table.
python3 - "$SUMMARY" << 'PYEOF'
import sys, json

with open(sys.argv[1]) as f:
    data = json.load(f)

if not data["results"]:
    print(" No results parsed.")
    sys.exit(0)

fmt = " {:<20} {:<16} {:<10} {:<8} {:>10}"
print(fmt.format("Model", "Backend", "KV cache", "Test", "t/s"))
print(" " + "-" * 68)
for r in data["results"]:
    # kv_cache may be absent in summaries written by older versions → default.
    print(fmt.format(
        r["model"][:20], r["backend"][:16],
        r.get("kv_cache", "f16/f16")[:10], r["test"],
        f"{r['tokens_per_sec']:.2f}"))
PYEOF

echo ""
log_success "Results saved to: $RESULT_DIR"