#!/usr/bin/env bash
# Launch llama-server with optimized settings for Strix Halo

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

MODEL_DIR="$(data_dir models)"
BACKEND="llama-vulkan-radv"
PORT=8080
CTX_SIZE=131072
PARALLEL=1
MODEL=""
NGRAM=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    -m|--model) MODEL="$2"; shift 2 ;;
    --backend)  BACKEND="$2"; shift 2 ;;
    --port)     PORT="$2"; shift 2 ;;
    --ctx)      CTX_SIZE="$2"; shift 2 ;;
    --parallel) PARALLEL="$2"; shift 2 ;;
    --ngram)    NGRAM=true; shift ;;
    --help|-h)
      echo "Usage: launch.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  -m, --model FILE  GGUF model filename (searches data/models/)"
      echo "  --backend NAME    Toolbox backend (default: llama-vulkan-radv)"
      echo "  --port N          Listen port (default: 8080)"
      echo "  --ctx N           Context size (default: 131072)"
      echo "  --parallel N      Parallel request slots (default: 1)"
      echo "  --ngram           Enable n-gram speculative decoding (~1.1-1.4x token generation)"
      echo ""
      echo "Presets (pass model filename):"
      echo "  Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf                General-purpose daily driver"
      echo "  Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf   Agentic coding"
      echo "  Qwen3-Coder-Next-UD-Q3_K_XL.gguf               Complex SE tasks"
      echo ""
      echo "Examples:"
      echo "  launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
      echo "  launch.sh -m Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf --ngram --ctx 262144"
      echo "  launch.sh -m Qwen3-Coder-30B-A3B-Instruct-UD-Q6_K_XL.gguf --parallel 2"
      exit 0
      ;;
    *) log_warn "Unknown argument: $1"; shift ;;
  esac
done

if [[ -z "$MODEL" ]]; then
  log_error "No model specified. Use -m MODEL_FILENAME"
  echo ""
  echo "Available models:"
  # Skip multimodal projector files and split-GGUF shards
  find -L "$MODEL_DIR" -type f -name '*.gguf' -not -name 'mmproj-*' \
    -not -name '*-000*-of-*.gguf' -printf '  %f\n' 2>/dev/null | sort
  exit 1
fi

# Find the model file anywhere under the models directory
MODEL_PATH="$(find -L "$MODEL_DIR" -type f -name "$MODEL" -print -quit 2>/dev/null)"
if [[ -z "$MODEL_PATH" ]]; then
  log_error "Model not found: $MODEL"
  exit 1
fi

# Resolve for toolbox: paths outside $HOME are only visible inside the
# container under the /run/host mount
TOOLBOX_MODEL_PATH="$(realpath "$MODEL_PATH")"
if [[ "$TOOLBOX_MODEL_PATH" != /home/* ]]; then
  TOOLBOX_MODEL_PATH="/run/host${TOOLBOX_MODEL_PATH}"
fi

# Backend-specific settings
declare -A SERVER_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-server"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-server"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-server"
  [llama-rocm-7.2]="/usr/local/bin/llama-server"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-server"
)
SERVER_BIN="${SERVER_PATHS[$BACKEND]:-}"
if [[ -z "$SERVER_BIN" ]]; then
  log_error "Unknown backend: $BACKEND"
  exit 1
fi
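# Optional manual sanity check (illustrative only, not executed here):
# confirm the resolved binary actually exists inside the chosen toolbox, e.g.
#   toolbox run -c llama-vulkan-radv -- ls -l /usr/sbin/llama-server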
# Check that the toolbox container exists
if ! toolbox list 2>/dev/null | grep -q "$BACKEND"; then
  log_error "Toolbox not found: $BACKEND"
  exit 1
fi

# Build environment args: route rocBLAS GEMMs through hipBLASLt on ROCm backends
ENV_ARGS=()
if [[ "$BACKEND" == *rocm* ]]; then
  ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
fi

# Build server args
SERVER_ARGS=(
  -ngl 99                   # Full GPU offload
  --no-mmap                 # Direct load, no mmap overhead
  -fa                       # Flash attention
  -m "$TOOLBOX_MODEL_PATH"
  -c "$CTX_SIZE"            # Context size
  --cache-type-k q4_0       # KV cache quantization (fastest on Vulkan)
  --cache-type-v q4_0
  --port "$PORT"
  -np "$PARALLEL"           # Parallel slots
)

# N-gram speculative decoding
if $NGRAM; then
  SERVER_ARGS+=(
    --spec-type ngram-simple
    --draft-max 64
    --draft-min 4
  )
fi

# Display config
log_header "llama-server"
log_info "Model: $(basename "$MODEL_PATH") ($(du -h "$MODEL_PATH" | cut -f1))"
log_info "Backend: $BACKEND"
log_info "Context: $CTX_SIZE tokens"
log_info "KV cache: q4_0/q4_0"
log_info "Parallel slots: $PARALLEL"
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
log_info "Port: $PORT"
log_info "Endpoint: http://localhost:$PORT"
echo ""
log_info "Starting server..."

# Launch (exec replaces this shell, so nothing after this line runs)
exec toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$SERVER_BIN" "${SERVER_ARGS[@]}"
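# Smoke test from another shell once the server is up (assumes the default
# port; llama-server exposes an OpenAI-compatible HTTP API):
#   curl -s http://localhost:8080/health
#   curl -s http://localhost:8080/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"messages":[{"role":"user","content":"Say hi"}]}'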