fix: address code review findings — batch args, venv path, serve flags

- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix stale venv path in CLAUDE.md (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test and batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
2026-03-31 10:10:48 +02:00
parent dd403a907c
commit 6ab08537ca
10 changed files with 137 additions and 93 deletions
--- a/scripts/serve/launch.sh
+++ b/scripts/serve/launch.sh
@@ -14,6 +14,7 @@ CTX_SIZE=131072
 PARALLEL=1
 MODEL=""
 NGRAM=false
+NO_THINK=false

 while [[ $# -gt 0 ]]; do
    case "$1" in
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
        --ctx)          CTX_SIZE="$2"; shift 2 ;;
        --parallel)     PARALLEL="$2"; shift 2 ;;
        --ngram)        NGRAM=true; shift ;;
+        --no-think)     NO_THINK=true; shift ;;
        --help|-h)
            echo "Usage: launch.sh [OPTIONS]"
            echo ""
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
            echo "  --ctx N              Context size (default: 131072)"
            echo "  --parallel N         Parallel request slots (default: 1)"
            echo "  --ngram              Enable n-gram speculative decoding (~1.1-1.4x tg)"
+            echo "  --no-think           Disable thinking/reasoning (faster for evals)"
            echo ""
            echo "Presets (pass model filename):"
            echo "  Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf     General purpose daily driver"
@@ -101,7 +104,7 @@ fi
 SERVER_ARGS=(
    -ngl 99                          # Full GPU offload
    --no-mmap                        # Direct load, no mmap overhead
-    -fa                              # Flash attention
+    -fa on                            # Flash attention
    -m "$TOOLBOX_MODEL_PATH"
    -c "$CTX_SIZE"                   # Context size
    --cache-type-k q4_0              # KV cache quantization (fastest on Vulkan)
@@ -110,6 +113,11 @@ SERVER_ARGS=(
    -np "$PARALLEL"                  # Parallel slots
 )

+# Disable thinking mode (faster for evals)
+if $NO_THINK; then
+    SERVER_ARGS+=(--reasoning-budget 0)
+fi
+
 # N-gram speculative decoding
 if $NGRAM; then
    SERVER_ARGS+=(
@@ -126,6 +134,7 @@ log_info "Backend: $BACKEND"
 log_info "Context: $CTX_SIZE tokens"
 log_info "KV cache: q4_0/q4_0"
 log_info "Parallel slots: $PARALLEL"
+$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
 $NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
 log_info "Port: $PORT"
 log_info "Endpoint: http://localhost:$PORT"