diff --git a/Makefile b/Makefile index 629119a..bc8bfad 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,13 @@ benchmark: ## Run full benchmark suite (supports ARGS="--tag NAME --max-size 20" benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare BEFORE=dir AFTER=dir) @bash bin/benchmark compare $(BEFORE) $(AFTER) +# --- Serve --- +serve: ## Launch llama-server with optimized settings (ARGS="-m MODEL.gguf") + @bash bin/serve $(ARGS) + +serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf") + @bash bin/serve --ngram $(ARGS) + # --- Optimize --- optimize: ## Interactive optimization walkthrough @bash bin/optimize --all diff --git a/bin/serve b/bin/serve new file mode 100755 index 0000000..ca5b789 --- /dev/null +++ b/bin/serve @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Server dispatcher +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +exec bash "$SCRIPT_DIR/scripts/serve/launch.sh" "$@" diff --git a/docs/optimization-log.md b/docs/optimization-log.md index 1dfca28..8e1d68b 100644 --- a/docs/optimization-log.md +++ b/docs/optimization-log.md @@ -135,16 +135,32 @@ Living document tracking what was applied, tested, and the actual results. Each ## Phase 5: Future / Blocked -### 5.1 Speculative Decoding +### 5.1 Speculative Decoding (draft model) -- **Status**: BLOCKED — llama.cpp PR #20075 (hybrid SSM/MoE fix) +- **Status**: BLOCKED — llama.cpp PR #20075 (hybrid SSM/MoE checkpoint/restore) - **Draft model**: Downloaded `Qwen3.5-0.8B-Q8_0.gguf` (812 MB) on 2026-03-27 -- **Last checked**: 2026-03-27 — PR open since 2026-03-03, has ROCm buffer issues +- **Last checked**: 2026-03-30 — PR stalled since Mar 5. ROCm buffer crashes in `copy_cell()`. Works on Metal/CUDA but not AMD. Months away from landing. 
### 5.2 Native MTP (Multi-Token Prediction) -- **Status**: BLOCKED — llama.cpp PR #20700 -- **Last checked**: 2026-03-27 — WIP, not expected to merge soon +- **Status**: BLOCKED — multiple dependencies unmerged +- **Last checked**: 2026-03-30 +- **Details**: 4 separate PRs in flight, none merged: + - PR #18886: MTP API framework (DRAFT since Feb 6) — foundation for all MTP work + - PR #20700: MTP for Qwen3.5 **dense only** (WIP, author says "not expected to merge soon") + - PR #15225: GLM-style MTP (open since Aug 2025, "slower than baseline") + - PR #18039: EAGLE3 speculative (open since Dec 2025) +- **Key gap**: No MTP implementation exists for MoE models. PR #20700 only covers dense Qwen3.5 (0.8B-27B), not the 35B-A3B MoE. +- **Timeline estimate**: MTP API (#18886) must merge first, then model-specific implementations can be adapted. Months, not weeks. + +### 5.2a N-gram Speculative Decoding (AVAILABLE NOW) + +- **Status**: WORKS TODAY — no upstream PRs needed +- **How**: `llama-server --spec-type ngram-simple --draft-max 64 --draft-min 4` +- **Expected**: 1.1-1.4x tg speedup on repetitive content (code, structured output) +- **Added to**: `make serve-ngram ARGS="-m MODEL.gguf"` and `bin/serve --ngram` +- **Notes**: Pattern-matches from token history, no draft model needed. Best for code generation where patterns repeat. No quality impact. 
+- **Verdict**: AVAILABLE — use `--ngram` flag when serving ### 5.3 GPU Clock Reporting diff --git a/scripts/serve/launch.sh b/scripts/serve/launch.sh new file mode 100755 index 0000000..b3513ff --- /dev/null +++ b/scripts/serve/launch.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# Launch llama-server with optimized settings for Strix Halo +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" +source "$SCRIPT_DIR/../../lib/detect.sh" +source "$SCRIPT_DIR/../../lib/format.sh" + +MODEL_DIR="$(data_dir models)" +BACKEND="llama-vulkan-radv" +PORT=8080 +CTX_SIZE=131072 +PARALLEL=1 +MODEL="" +NGRAM=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -m|--model) MODEL="$2"; shift 2 ;; + --backend) BACKEND="$2"; shift 2 ;; + --port) PORT="$2"; shift 2 ;; + --ctx) CTX_SIZE="$2"; shift 2 ;; + --parallel) PARALLEL="$2"; shift 2 ;; + --ngram) NGRAM=true; shift ;; + --help|-h) + echo "Usage: launch.sh [OPTIONS]" + echo "" + echo "Options:" + echo " -m, --model FILE GGUF model filename (searches data/models/)" + echo " --backend NAME Toolbox backend (default: llama-vulkan-radv)" + echo " --port N Listen port (default: 8080)" + echo " --ctx N Context size (default: 131072)" + echo " --parallel N Parallel request slots (default: 1)" + echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)" + echo "" + echo "Presets (pass model filename):" + echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver" + echo " Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf Agentic coding" + echo " Qwen3-Coder-Next-UD-Q3_K_XL.gguf Complex SE tasks" + echo "" + echo "Examples:" + echo " launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf" + echo " launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf --ngram --ctx 262144" + echo " launch.sh -m Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf --parallel 2" + exit 0 ;; + *) log_warn "Unknown argument: $1"; shift ;; + esac +done + +if [[ -z "$MODEL" ]]; then + log_error "No model specified. 
Use -m MODEL_FILENAME" + echo "" + echo "Available models:" + find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \ + -not -name '*-000*-of-*.gguf' -printf ' %f\n' 2>/dev/null | sort + exit 1 +fi + +# Find model file +MODEL_PATH="$(find -L "$MODEL_DIR" -type f -name "$MODEL" -print -quit 2>/dev/null)" +if [[ -z "$MODEL_PATH" ]]; then + log_error "Model not found: $MODEL" + exit 1 +fi + +# Resolve for toolbox +TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")" +if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then + TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}" +fi + +# Backend-specific settings +declare -A SERVER_PATHS=( + [llama-vulkan-radv]="/usr/sbin/llama-server" + [llama-vulkan-amdvlk]="/usr/sbin/llama-server" + [llama-rocm-6.4.4]="/usr/local/bin/llama-server" + [llama-rocm-7.2]="/usr/local/bin/llama-server" + [llama-rocm7-nightlies]="/usr/local/bin/llama-server" +) + +SERVER_BIN="${SERVER_PATHS[$BACKEND]:-}" +if [[ -z "$SERVER_BIN" ]]; then + log_error "Unknown backend: $BACKEND" + exit 1 +fi + +# Check toolbox exists +if ! 
toolbox list 2>/dev/null | grep -q "$BACKEND"; then + log_error "Toolbox not found: $BACKEND" + exit 1 +fi + +# Build environment args +ENV_ARGS=() +if [[ "$BACKEND" == *rocm* ]]; then + ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1) +fi + +# Build server args +SERVER_ARGS=( + -ngl 99 # Full GPU offload + --no-mmap # Direct load, no mmap overhead + -fa # Flash attention + -m "$TOOLBOX_MODEL_PATH" + -c "$CTX_SIZE" # Context size + --cache-type-k q4_0 # KV cache quantization (fastest on Vulkan) + --cache-type-v q4_0 + --port "$PORT" + -np "$PARALLEL" # Parallel slots +) + +# N-gram speculative decoding +if $NGRAM; then + SERVER_ARGS+=( + --spec-type ngram-simple + --draft-max 64 + --draft-min 4 + ) +fi + +# Display config +log_header "llama-server" +log_info "Model: $(basename "$MODEL_PATH") ($(du -h "$MODEL_PATH" | cut -f1))" +log_info "Backend: $BACKEND" +log_info "Context: $CTX_SIZE tokens" +log_info "KV cache: q4_0/q4_0" +log_info "Parallel slots: $PARALLEL" +$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)" +log_info "Port: $PORT" +log_info "Endpoint: http://localhost:$PORT" +echo "" +log_info "Starting server..." + +# Launch (empty-array guard: "${ENV_ARGS[@]}" alone trips `set -u` on bash < 4.4) +exec toolbox run -c "$BACKEND" -- ${ENV_ARGS[@]+"${ENV_ARGS[@]}"} "$SERVER_BIN" "${SERVER_ARGS[@]}"