fix: address code review findings — batch args, venv path, serve flags

- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
Felipe Cardoso
2026-03-31 10:10:48 +02:00
parent dd403a907c
commit 6ab08537ca
10 changed files with 137 additions and 93 deletions

View File

@@ -14,6 +14,7 @@ CTX_SIZE=131072
# Defaults for the command-line options handled by the argument loop below.
# The boolean flags store the literal words "true"/"false" so they can later
# be executed directly as commands (e.g. `if $NO_THINK; then ...`).
PARALLEL=1 # Parallel request slots; overridden by --parallel N
MODEL="" # Empty by default; where it is assigned is not visible in this excerpt
NGRAM=false # Set to true by --ngram (n-gram speculative decoding)
NO_THINK=false # Set to true by --no-think; adds --reasoning-budget 0 to SERVER_ARGS
while [[ $# -gt 0 ]]; do
case "$1" in
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
--ctx) CTX_SIZE="$2"; shift 2 ;;
--parallel) PARALLEL="$2"; shift 2 ;;
--ngram) NGRAM=true; shift ;;
--no-think) NO_THINK=true; shift ;;
--help|-h)
echo "Usage: launch.sh [OPTIONS]"
echo ""
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
echo " --ctx N Context size (default: 131072)"
echo " --parallel N Parallel request slots (default: 1)"
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
echo " --no-think Disable thinking/reasoning (faster for evals)"
echo ""
echo "Presets (pass model filename):"
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
@@ -101,7 +104,7 @@ fi
SERVER_ARGS=(
-ngl 99 # Full GPU offload
--no-mmap # Direct load, no mmap overhead
-fa # Flash attention
-fa on # Flash attention
-m "$TOOLBOX_MODEL_PATH"
-c "$CTX_SIZE" # Context size
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
@@ -110,6 +113,11 @@ SERVER_ARGS=(
-np "$PARALLEL" # Parallel slots
)
# Disable thinking mode (faster for evals).
# NO_THINK defaults to "false" and is set to "true" by --no-think, so
# `if $NO_THINK` runs that word as the `true`/`false` command.
# NOTE(review): if NO_THINK were ever unset or empty, the condition would
# expand to an empty command (exit status 0) and the flag would be added.
if $NO_THINK; then
# Appends the flag and its value as two separate array elements.
# Presumably a budget of 0 makes the server emit no reasoning tokens —
# confirm against the server's own --help output.
SERVER_ARGS+=(--reasoning-budget 0)
fi
# N-gram speculative decoding
if $NGRAM; then
SERVER_ARGS+=(
@@ -126,6 +134,7 @@ log_info "Backend: $BACKEND"
# Startup summary: report the effective configuration before launching.
log_info "Context: $CTX_SIZE tokens"
# The KV cache types are reported as a fixed string, not read from a variable.
log_info "KV cache: q4_0/q4_0"
log_info "Parallel slots: $PARALLEL"
# Optional features are logged only when enabled: the variable holds the
# word "true" or "false", which is run as a command and short-circuits `&&`.
$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
log_info "Port: $PORT"
log_info "Endpoint: http://localhost:$PORT"