feat(serve): add optimized llama-server launcher with n-gram speculation

Add `make serve` and `make serve-ngram` for launching llama-server with baked-in optimal settings (Vulkan RADV, q4_0 KV cache, flash attention, no-mmap, full GPU offload). N-gram speculative decoding gives 1.1-1.4x tg speedup on repetitive content without upstream PR dependencies. Update Phase 5 status: MTP is months away (4 unmerged PRs, no MoE support), draft-model speculation stalled on ROCm buffer crashes.
2026-03-30 21:12:30 +02:00
parent ba24091791
commit dd403a907c
4 changed files with 169 additions and 5 deletions
--- a/7
+++ b/7
@@ -38,6 +38,13 @@ benchmark: ## Run full benchmark suite (supports ARGS="--tag NAME --max-size 20"
 benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare BEFORE=dir AFTER=dir)
 	@bash bin/benchmark compare $(BEFORE) $(AFTER)

+# --- Serve (llama-server launchers; ARGS is forwarded to bin/serve) ---
+serve: ## Launch llama-server with optimized settings (ARGS="-m MODEL.gguf")
+	@bash bin/serve $(ARGS)
+
+serve-ngram: ## Launch with n-gram speculative decoding (ARGS="-m MODEL.gguf")
+	@bash bin/serve --ngram $(ARGS)
+
 # --- Optimize ---
 optimize: ## Interactive optimization walkthrough
 	@bash bin/optimize --all
--- a/bin/serve
+++ b/bin/serve
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# Dispatcher behind `make serve`: finds the repo root (parent of bin/) and hands every argument to the launcher
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+exec bash "${REPO_ROOT}/scripts/serve/launch.sh" "$@"
--- a/docs/optimization-log.md
+++ b/docs/optimization-log.md
@@ -135,16 +135,32 @@ Living document tracking what was applied, tested, and the actual results. Each

 ## Phase 5: Future / Blocked

-### 5.1 Speculative Decoding
+### 5.1 Speculative Decoding (draft model)

- **Status**: BLOCKED — llama.cpp PR #20075 (hybrid SSM/MoE fix)
+- **Status**: BLOCKED — llama.cpp PR #20075 (hybrid SSM/MoE checkpoint/restore)
 - **Draft model**: Downloaded `Qwen3.5-0.8B-Q8_0.gguf` (812 MB) on 2026-03-27
- **Last checked**: 2026-03-27 — PR open since 2026-03-03, has ROCm buffer issues
+- **Last checked**: 2026-03-30 — PR stalled since Mar 5. ROCm buffer crashes in `copy_cell()`. Works on Metal/CUDA but not AMD. Months away from landing.

 ### 5.2 Native MTP (Multi-Token Prediction)

- **Status**: BLOCKED — llama.cpp PR #20700
- **Last checked**: 2026-03-27 — WIP, not expected to merge soon
+- **Status**: BLOCKED — multiple dependencies unmerged
+- **Last checked**: 2026-03-30
+- **Details**: 4 separate PRs in flight, none merged:
+  - PR #18886: MTP API framework (DRAFT since Feb 6) — foundation for all MTP work
+  - PR #20700: MTP for Qwen3.5 **dense only** (WIP, author says "not expected to merge soon")
+  - PR #15225: GLM-style MTP (open since Aug 2025, "slower than baseline")
+  - PR #18039: EAGLE3 speculative (open since Dec 2025)
+- **Key gap**: No MTP implementation exists for MoE models. PR #20700 only covers dense Qwen3.5 (0.8B-27B), not the 35B-A3B MoE.
+- **Timeline estimate**: The MTP API (#18886) must merge first; the model-specific implementations then have to be adapted to it. Months, not weeks.
+
+### 5.2a N-gram Speculative Decoding (AVAILABLE NOW)
+
+- **Status**: WORKS TODAY — no upstream PRs needed
+- **How**: `llama-server --spec-type ngram-simple --draft-max 64 --draft-min 4`
+- **Expected**: 1.1-1.4x token-generation (tg) speedup on repetitive content (code, structured output)
+- **Added to**: `make serve-ngram ARGS="-m MODEL.gguf"` and `bin/serve --ngram`
+- **Notes**: Pattern-matches from token history, no draft model needed. Best for code generation where patterns repeat. No quality impact.
+- **Verdict**: AVAILABLE — use `--ngram` flag when serving

 ### 5.3 GPU Clock Reporting

--- a/scripts/serve/launch.sh
+++ b/scripts/serve/launch.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# Launch llama-server (through `toolbox run`) with optimized settings for Strix Halo; run with --help for the options
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # directory containing this script
+source "$SCRIPT_DIR/../../lib/common.sh"   # NOTE(review): presumably provides data_dir and the log_* helpers used below — confirm
+source "$SCRIPT_DIR/../../lib/detect.sh"
+source "$SCRIPT_DIR/../../lib/format.sh"
+
+MODEL_DIR="$(data_dir models)"   # directory tree searched for .gguf files
+BACKEND="llama-vulkan-radv"      # toolbox name; overridden by --backend
+PORT=8080                        # overridden by --port
+CTX_SIZE=131072                  # context size in tokens; overridden by --ctx
+PARALLEL=1                       # parallel request slots; overridden by --parallel
+MODEL=""                         # set by -m/--model; the script refuses to start while this is empty
+NGRAM=false                      # flipped to true by --ngram
+
+while [[ $# -gt 0 ]]; do  # "${2:?...}" below aborts with a readable message when an option's value is missing or empty
+    case "$1" in
+        -m|--model)     MODEL="${2:?$1 requires a value}"; shift 2 ;;
+        --backend)      BACKEND="${2:?$1 requires a value}"; shift 2 ;;
+        --port)         PORT="${2:?$1 requires a value}"; shift 2 ;;
+        --ctx)          CTX_SIZE="${2:?$1 requires a value}"; shift 2 ;;
+        --parallel)     PARALLEL="${2:?$1 requires a value}"; shift 2 ;;
+        --ngram)        NGRAM=true; shift ;;
+        --help|-h)
+            echo "Usage: launch.sh [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  -m, --model FILE     GGUF model filename (searches data/models/)"
+            echo "  --backend NAME       Toolbox backend (default: llama-vulkan-radv)"
+            echo "  --port N             Listen port (default: 8080)"
+            echo "  --ctx N              Context size (default: 131072)"
+            echo "  --parallel N         Parallel request slots (default: 1)"
+            echo "  --ngram              Enable n-gram speculative decoding (~1.1-1.4x tg)"
+            echo ""
+            echo "Presets (pass model filename):"
+            echo "  Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf     General purpose daily driver"
+            echo "  Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf  Agentic coding"
+            echo "  Qwen3-Coder-Next-UD-Q3_K_XL.gguf     Complex SE tasks"
+            echo ""
+            echo "Examples:"
+            echo "  launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
+            echo "  launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf --ngram --ctx 262144"
+            echo "  launch.sh -m Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf --parallel 2"
+            exit 0 ;;
+        *) log_warn "Unknown argument: $1"; shift ;;  # unknown arguments are reported and skipped, not fatal
+    esac
+done
+
+if [[ -z "$MODEL" ]]; then  # a model is mandatory: show what is available under MODEL_DIR, then bail out
+    log_error "No model specified. Use -m MODEL_FILENAME"
+    echo ""
+    echo "Available models:"
+    find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
+        -not -name '*-000*-of-*.gguf' -printf '  %f\n' 2>/dev/null | sort
+    exit 1
+fi
+
+# Find model file: first name match under MODEL_DIR, symlinks followed. "|| true" stops a find error (missing/unreadable dir) from killing the script silently under set -e.
+MODEL_PATH="$(find -L "$MODEL_DIR" -type f -name "$MODEL" -print -quit 2>/dev/null || true)"
+if [[ -z "$MODEL_PATH" ]]; then
+    log_error "Model not found: $MODEL"
+    exit 1
+fi
+
+# Resolve for toolbox: canonical paths outside /home get a /run/host prefix (NOTE(review): presumably where the toolbox exposes the host filesystem — confirm)
+TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
+if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then
+    TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
+fi
+
+# Backend-specific settings: path of the llama-server binary for each supported toolbox name
+declare -A SERVER_PATHS=(
+    [llama-vulkan-radv]="/usr/sbin/llama-server"
+    [llama-vulkan-amdvlk]="/usr/sbin/llama-server"
+    [llama-rocm-6.4.4]="/usr/local/bin/llama-server"
+    [llama-rocm-7.2]="/usr/local/bin/llama-server"
+    [llama-rocm7-nightlies]="/usr/local/bin/llama-server"
+)
+
+SERVER_BIN="${SERVER_PATHS[$BACKEND]:-}"
+if [[ -z "$SERVER_BIN" ]]; then
+    log_error "Unknown backend: $BACKEND"
+    exit 1
+fi
+
+# Check toolbox exists. Capture the list first: `grep -q` in a pipeline can SIGPIPE the producer and trip pipefail. -F keeps "." in backend names literal.
+if ! grep -Fq -- "$BACKEND" <<<"$(toolbox list 2>/dev/null || true)"; then
+    log_error "Toolbox not found: $BACKEND"
+    exit 1
+fi
+
+# Environment prefix placed in front of the server binary (non-empty only for ROCm backends)
+ENV_ARGS=()
+case "$BACKEND" in
+    *rocm*) ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1) ;;
+esac
+
+# llama-server command line, assembled one flag at a time:
+# fixed performance flags first, then the values chosen on the command line.
+SERVER_ARGS=()
+SERVER_ARGS+=(-ngl 99)                     # full GPU offload
+SERVER_ARGS+=(--no-mmap)                   # load directly, skipping mmap
+SERVER_ARGS+=(-fa)                         # flash attention
+SERVER_ARGS+=(-m "$TOOLBOX_MODEL_PATH")    # model path as resolved for the toolbox
+SERVER_ARGS+=(-c "$CTX_SIZE")              # context size
+SERVER_ARGS+=(--cache-type-k q4_0)         # KV cache quantization, K half (noted as fastest on Vulkan)
+SERVER_ARGS+=(--cache-type-v q4_0)         # KV cache quantization, V half
+SERVER_ARGS+=(--port "$PORT")              # listen port
+SERVER_ARGS+=(-np "$PARALLEL")             # parallel slots
+
+# Optional n-gram speculative decoding, requested with --ngram.
+# The draft limits here are fixed values, not user-tunable.
+if [[ "$NGRAM" == true ]]; then
+    # Appended after the base flags so the final argument order is unchanged.
+    SERVER_ARGS+=(--spec-type ngram-simple)
+    SERVER_ARGS+=(--draft-max 64)
+    SERVER_ARGS+=(--draft-min 4)
+fi
+
+# Display config (summary of the effective settings before handing over to the server)
+log_header "llama-server"
+log_info "Model: $(basename "$MODEL_PATH") ($(du -Lh "$MODEL_PATH" | cut -f1))"  # -L: report the target's size when the model is a symlink
+log_info "Backend: $BACKEND"
+log_info "Context: $CTX_SIZE tokens"
+log_info "KV cache: q4_0/q4_0"
+log_info "Parallel slots: $PARALLEL"
+$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
+log_info "Port: $PORT"
+log_info "Endpoint: http://localhost:$PORT"
+echo ""
+log_info "Starting server..."
+
+# Launch: exec replaces this shell with the toolbox process. ${ENV_ARGS[@]+...} expands to nothing for an empty array; a plain "${ENV_ARGS[@]}" trips `set -u` on bash before 4.4.
+exec toolbox run -c "$BACKEND" -- ${ENV_ARGS[@]+"${ENV_ARGS[@]}"} "$SERVER_BIN" "${SERVER_ARGS[@]}"