Files
Felipe Cardoso 15bb6a8ed9 feat(serve): set APEX I-Compact as default, harden benchmark workflow
Serving:
- make serve now launches Claude-distilled APEX 35B-A3B (16GB) with 2
  parallel slots and 256K context as the daily driver
- add serve-custom for ad-hoc model testing
- add flush-gpu to reclaim unified memory after stuck runs

Benchmarks:
- default Vulkan-only backends (ROCm trails at long context)
- add --backends filter to run-baseline.sh
- fix backend filter substring bug (grep -qFx for exact line match)
- fix model filter regex metacharacter bug (grep -qiF for literal)
- respect --tg in long-context tests instead of hardcoded n=32

ROCm bump to 7.2.1 (kernel 6.18.4+ patch); keep 7.2 as optional.

Catalog:
- add mudler APEX I-Compact (Claude-distilled 35B, 17GB)
- add 0xSero REAP-40 (pruned 122B-A10B, 46GB)
- update download instructions: hf download (huggingface-cli is gone)
2026-04-13 01:11:46 +02:00

134 lines
5.1 KiB
Bash

#!/usr/bin/env bash
# Benchmark setup — ensure toolboxes and test models are ready
#
# Checks that the required llama.cpp toolbox containers exist (offering to
# create missing ones), probes GPU visibility inside them, scans for GGUF
# test models, prints the model catalog, and offers a starter download.
#
# Requires helpers from the sourced libs below: log_* / confirm / is_cmd /
# data_dir / BOLD / RESET / PROJECT_ROOT (common.sh) and
# detect_toolbox_names (detect.sh) — NOTE(review): inferred from call
# sites in this script; confirm against lib/common.sh and lib/detect.sh.
set -euo pipefail
# Absolute directory of this script, so the relative `source` paths work
# regardless of the caller's CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
# Local checkout of kyuz0/amd-strix-halo-toolboxes, used to build containers.
TOOLBOXES_REPO="/data/workspace/projects/HomeLab/strix-halo-toolboxes/amd-strix-halo-llamacpp-toolboxes"
MODEL_DIR="$(data_dir models)"
# Pipe-delimited catalog: name|repo|file|size_gb|category|description
# (field order established by the `read` in the catalog loop below).
MODEL_CATALOG="$PROJECT_ROOT/configs/models.conf"
log_header "Benchmark Setup"
# ── 1. Check toolbox containers ──────────────────────────
log_info "Checking toolbox containers..."
REQUIRED_TOOLBOXES=("llama-vulkan-radv" "llama-rocm-7.2.1")
OPTIONAL_TOOLBOXES=("llama-rocm-7.2" "llama-rocm-6.4.4" "llama-vulkan-amdvlk")
existing=$(detect_toolbox_names 2>/dev/null || true)
missing=()
# Required toolboxes: anything not found is collected in `missing` for the
# creation step below.
for tb in "${REQUIRED_TOOLBOXES[@]}"; do
  # -Fx: fixed-string, whole-line match. A plain -F substring test would
  # let "llama-rocm-7.2" match the "llama-rocm-7.2.1" line (and similar
  # prefixes), misreporting which toolboxes actually exist.
  if grep -qFx "$tb" <<<"$existing"; then
    log_success "Toolbox: $tb"
  else
    missing+=("$tb")
    log_warn "Toolbox missing: $tb"
  fi
done
# Optional toolboxes: report presence only, never added to `missing`.
for tb in "${OPTIONAL_TOOLBOXES[@]}"; do
  if grep -qFx "$tb" <<<"$existing"; then
    log_success "Toolbox: $tb (optional)"
  else
    log_info "Toolbox not present: $tb (optional)"
  fi
done
# Create any required toolboxes that were not found above. Bails out with
# clone instructions when the toolboxes repo checkout is absent.
if (( ${#missing[@]} > 0 )); then
  log_info "Need to create required toolboxes."
  # Guard clause: no repo checkout means we cannot build anything.
  if [[ ! -d "$TOOLBOXES_REPO" ]]; then
    log_error "Toolboxes repo not found at: $TOOLBOXES_REPO"
    log_info "Clone it: git clone https://github.com/kyuz0/amd-strix-halo-toolboxes"
    log_info "Then re-run this setup."
    exit 1
  fi
  log_info "Found toolboxes repo at: $TOOLBOXES_REPO"
  if confirm "Create missing toolboxes using refresh-toolboxes.sh?"; then
    for tb in "${missing[@]}"; do
      log_info "Creating $tb..."
      bash "$TOOLBOXES_REPO/refresh-toolboxes.sh" "$tb"
    done
  fi
fi
# ── 2. Verify GPU access inside toolboxes ────────────────
log_info "Verifying GPU access in toolboxes..."
for tb in "${REQUIRED_TOOLBOXES[@]}"; do
  # Only probe toolboxes that exist. -Fx forces an exact line match so a
  # name like "llama-rocm-7.2" cannot substring-match "llama-rocm-7.2.1"
  # and trigger a probe of a nonexistent container.
  if grep -qFx "$tb" <<<"$existing"; then
    # Treat a device listing that mentions any GPU backend as success;
    # this is a heuristic, not a hard failure (we only warn).
    if toolbox run -c "$tb" -- llama-cli --list-devices 2>/dev/null | grep -qi "vulkan\|rocm\|radeon\|available devices"; then
      log_success "GPU accessible in $tb"
    else
      log_warn "GPU may not be accessible in $tb — check device mappings"
    fi
  fi
done
# ── 3. Check for test models ────────────────────────────
log_info "Checking for test models in $MODEL_DIR..."
# Count GGUF weights, skipping multimodal projector files; follow symlinks.
model_count=$(find -L "$MODEL_DIR" -name "*.gguf" -not -name "mmproj-*" 2>/dev/null | wc -l)
if (( model_count == 0 )); then
  log_warn "No GGUF models found in $MODEL_DIR"
else
  log_success "Found $model_count model(s):"
  # List each model with its human-readable size.
  while read -r model_path; do
    printf " %s (%s)\n" "$(basename "$model_path")" "$(du -h "$model_path" | cut -f1)"
  done < <(find -L "$MODEL_DIR" -name "*.gguf" -not -name "mmproj-*")
fi
# ── 4. Show model catalog ───────────────────────────────
log_header "Model Catalog"
log_info "Available models (from configs/models.conf):"
echo ""
printf " ${BOLD}%-28s %-10s %-8s %s${RESET}\n" "Name" "Category" "Size" "Description"
echo " $(printf '%.0s─' {1..70})"
# Catalog rows are pipe-delimited: name|repo|file|size_gb|category|desc.
while IFS='|' read -r name repo file size_gb category desc; do
  # Skip blank lines and '#'-prefixed comment lines.
  [[ -z "$name" || "$name" == "#"* ]] && continue
  # '*' marks models already present in MODEL_DIR.
  status=" "
  [[ -n "$(find -L "$MODEL_DIR" -name "$file" -print -quit 2>/dev/null)" ]] && status="*"
  printf " %s %-27s %-10s %4s GB %s\n" "$status" "$name" "$category" "$size_gb" "$desc"
done < "$MODEL_CATALOG"
echo ""
echo " (* = downloaded)"
echo ""
# ── 5. Offer downloads ──────────────────────────────────
# Recent huggingface_hub releases ship the `hf` CLI and drop the legacy
# `huggingface-cli` entry point (see catalog notes). Prefer `hf`, but keep
# the old name working on machines with an older huggingface_hub.
if is_cmd hf; then
  HF_CLI="hf"
elif is_cmd huggingface-cli; then
  HF_CLI="huggingface-cli"
else
  HF_CLI=""
fi
if [[ -n "$HF_CLI" ]]; then
  log_info "Download models with:"
  echo ""
  echo " # Recommended starter set:"
  echo " # Smoke test (3 GB):"
  echo " $HF_CLI download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf --local-dir $MODEL_DIR"
  echo ""
  echo " # Top pick — Qwen3.5-35B-A3B MoE Q8 (37 GB, ~85 t/s gen):"
  echo " $HF_CLI download unsloth/Qwen3.5-35B-A3B-GGUF Qwen3.5-35B-A3B-Q8_0.gguf --local-dir $MODEL_DIR"
  echo ""
  echo " # Agentic/coding — Qwen3-Coder-30B-A3B (18 GB, best for tool use):"
  echo " $HF_CLI download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf --local-dir $MODEL_DIR"
  echo ""
  echo " # Or download any model from catalog:"
  echo " # $HF_CLI download REPO FILE --local-dir $MODEL_DIR"
  echo ""
  # Offer a small smoke-test model only when nothing is downloaded yet.
  if (( model_count == 0 )); then
    if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as smoke test model?"; then
      "$HF_CLI" download unsloth/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \
        --local-dir "$MODEL_DIR"
      log_success "Model downloaded"
    fi
  fi
else
  log_info "Install the Hugging Face CLI: pip install -U 'huggingface_hub[cli]' (provides the 'hf' command)"
fi
# ── Done ────────────────────────────────────────────────
log_header "Setup Complete"
log_info "Run 'make benchmark-baseline' to capture your baseline."