fix: address code review findings — batch args, venv path, serve flags
- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
@@ -14,6 +14,7 @@ CTX_SIZE=131072
|
||||
PARALLEL=1
|
||||
MODEL=""
|
||||
NGRAM=false
|
||||
NO_THINK=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
|
||||
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
||||
--parallel) PARALLEL="$2"; shift 2 ;;
|
||||
--ngram) NGRAM=true; shift ;;
|
||||
--no-think) NO_THINK=true; shift ;;
|
||||
--help|-h)
|
||||
echo "Usage: launch.sh [OPTIONS]"
|
||||
echo ""
|
||||
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
|
||||
echo " --ctx N Context size (default: 131072)"
|
||||
echo " --parallel N Parallel request slots (default: 1)"
|
||||
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
||||
echo " --no-think Disable thinking/reasoning (faster for evals)"
|
||||
echo ""
|
||||
echo "Presets (pass model filename):"
|
||||
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
||||
@@ -101,7 +104,7 @@ fi
|
||||
SERVER_ARGS=(
|
||||
-ngl 99 # Full GPU offload
|
||||
--no-mmap # Direct load, no mmap overhead
|
||||
-fa # Flash attention
|
||||
-fa on # Flash attention
|
||||
-m "$TOOLBOX_MODEL_PATH"
|
||||
-c "$CTX_SIZE" # Context size
|
||||
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
||||
@@ -110,6 +113,11 @@ SERVER_ARGS=(
|
||||
-np "$PARALLEL" # Parallel slots
|
||||
)
|
||||
|
||||
# Disable thinking mode (faster for evals)
|
||||
if $NO_THINK; then
|
||||
SERVER_ARGS+=(--reasoning-budget 0)
|
||||
fi
|
||||
|
||||
# N-gram speculative decoding
|
||||
if $NGRAM; then
|
||||
SERVER_ARGS+=(
|
||||
@@ -126,6 +134,7 @@ log_info "Backend: $BACKEND"
|
||||
log_info "Context: $CTX_SIZE tokens"
|
||||
log_info "KV cache: q4_0/q4_0"
|
||||
log_info "Parallel slots: $PARALLEL"
|
||||
$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
|
||||
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
||||
log_info "Port: $PORT"
|
||||
log_info "Endpoint: http://localhost:$PORT"
|
||||
|
||||
Reference in New Issue
Block a user