Add `make serve` and `make serve-ngram` for launching llama-server with baked-in optimal settings (Vulkan RADV, q4_0 KV cache, flash attention, no-mmap, full GPU offload). N-gram speculative decoding gives 1.1-1.4x tg speedup on repetitive content without upstream PR dependencies. Update Phase 5 status: MTP is months away (4 unmerged PRs, no MoE support), draft-model speculation stalled on ROCm buffer crashes.
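A sketch of what the two Makefile targets plausibly expand to (target names come from the commit message; the script path and MODEL variable are assumptions for illustration):

    # Hypothetical Makefile excerpt
    MODEL ?= Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf

    serve:
    	./launch.sh -m $(MODEL)

    serve-ngram:
    	./launch.sh -m $(MODEL) --ngram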
#!/usr/bin/env bash
# Launch llama-server with optimized settings for Strix Halo
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

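# Defaults (most can be overridden via the CLI flags parsed below)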
MODEL_DIR="$(data_dir models)"
BACKEND="llama-vulkan-radv"
PORT=8080
CTX_SIZE=131072
PARALLEL=1
MODEL=""
NGRAM=false

while [[ $# -gt 0 ]]; do
    case "$1" in
        -m|--model) MODEL="$2"; shift 2 ;;
        --backend) BACKEND="$2"; shift 2 ;;
        --port) PORT="$2"; shift 2 ;;
        --ctx) CTX_SIZE="$2"; shift 2 ;;
        --parallel) PARALLEL="$2"; shift 2 ;;
        --ngram) NGRAM=true; shift ;;
        --help|-h)
            echo "Usage: launch.sh [OPTIONS]"
            echo ""
            echo "Options:"
            echo "  -m, --model FILE   GGUF model filename (searches data/models/)"
            echo "  --backend NAME     Toolbox backend (default: llama-vulkan-radv)"
            echo "  --port N           Listen port (default: 8080)"
            echo "  --ctx N            Context size (default: 131072)"
            echo "  --parallel N       Parallel request slots (default: 1)"
            echo "  --ngram            Enable n-gram speculative decoding (~1.1-1.4x tg)"
            echo ""
            echo "Presets (pass model filename):"
            echo "  Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf                General purpose daily driver"
            echo "  Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf   Agentic coding"
            echo "  Qwen3-Coder-Next-UD-Q3_K_XL.gguf               Complex SE tasks"
            echo ""
            echo "Examples:"
            echo "  launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
            echo "  launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf --ngram --ctx 262144"
            echo "  launch.sh -m Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf --parallel 2"
            exit 0 ;;
        *) log_warn "Unknown argument: $1"; shift ;;
    esac
done

if [[ -z "$MODEL" ]]; then
    log_error "No model specified. Use -m MODEL_FILENAME"
    echo ""
    echo "Available models:"
    find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
        -not -name '*-000*-of-*.gguf' -printf ' %f\n' 2>/dev/null | sort
    exit 1
fi

# Find model file
MODEL_PATH="$(find -L "$MODEL_DIR" -type f -name "$MODEL" -print -quit 2>/dev/null)"
if [[ -z "$MODEL_PATH" ]]; then
    log_error "Model not found: $MODEL"
    exit 1
fi

# Resolve for toolbox
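# Toolbox containers see the host filesystem under /run/host; the home
# directory is shared at the same path, so only paths outside /home need
# the prefix.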
TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then
    TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
fi

# Backend-specific settings
declare -A SERVER_PATHS=(
    [llama-vulkan-radv]="/usr/sbin/llama-server"
    [llama-vulkan-amdvlk]="/usr/sbin/llama-server"
    [llama-rocm-6.4.4]="/usr/local/bin/llama-server"
    [llama-rocm-7.2]="/usr/local/bin/llama-server"
    [llama-rocm7-nightlies]="/usr/local/bin/llama-server"
)

SERVER_BIN="${SERVER_PATHS[$BACKEND]:-}"
if [[ -z "$SERVER_BIN" ]]; then
    log_error "Unknown backend: $BACKEND"
    exit 1
fi

# Check toolbox exists
if ! toolbox list 2>/dev/null | grep -q "$BACKEND"; then
    log_error "Toolbox not found: $BACKEND"
    exit 1
fi

# Build environment args
ENV_ARGS=()
if [[ "$BACKEND" == *rocm* ]]; then
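    # ROCBLAS_USE_HIPBLASLT=1 lets rocBLAS dispatch GEMMs through hipBLASLt
    # where supported -- generally faster on gfx1151 (Strix Halo)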
    ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
fi

# Build server args
SERVER_ARGS=(
    -ngl 99               # Full GPU offload
    --no-mmap             # Direct load, no mmap overhead
    -fa                   # Flash attention
    -m "$TOOLBOX_MODEL_PATH"
    -c "$CTX_SIZE"        # Context size
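    # q4_0 K/V cuts KV-cache memory to roughly a quarter of f16,
    # which is what makes 128K+ contexts practical here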
    --cache-type-k q4_0   # KV cache quantization (fastest on Vulkan)
    --cache-type-v q4_0
    --port "$PORT"
    -np "$PARALLEL"       # Parallel slots
)

# N-gram speculative decoding
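# Drafts tokens by matching n-grams already present in the context, so no
# separate draft model (or upstream PR) is needed; ~1.1-1.4x tg speedup on
# repetitive content.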
if $NGRAM; then
    SERVER_ARGS+=(
        --spec-type ngram-simple
        --draft-max 64
        --draft-min 4
    )
fi

# Display config
log_header "llama-server"
log_info "Model: $(basename "$MODEL_PATH") ($(du -h "$MODEL_PATH" | cut -f1))"
log_info "Backend: $BACKEND"
log_info "Context: $CTX_SIZE tokens"
log_info "KV cache: q4_0/q4_0"
log_info "Parallel slots: $PARALLEL"
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
log_info "Port: $PORT"
log_info "Endpoint: http://localhost:$PORT"
echo ""
log_info "Starting server..."

# Launch
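# exec replaces this wrapper shell with the toolbox client, so no stray
# parent process lingers around the server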
exec toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$SERVER_BIN" "${SERVER_ARGS[@]}"