fix: address code review findings — batch args, venv path, serve flags
- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,4 +1,5 @@
|
|||||||
data/
|
data/
|
||||||
|
.venv/
|
||||||
*.log
|
*.log
|
||||||
*.csv
|
*.csv
|
||||||
*.tmp
|
*.tmp
|
||||||
|
|||||||
14
CLAUDE.md
14
CLAUDE.md
@@ -41,9 +41,21 @@ make verify # 9-point optimization checklist
|
|||||||
bin/audit --json | python3 -m json.tool # Verify JSON output is valid
|
bin/audit --json | python3 -m json.tool # Verify JSON output is valid
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Serving
|
||||||
|
|
||||||
|
`scripts/serve/launch.sh` with dispatcher at `bin/serve`. Launches llama-server inside toolbox containers with optimized defaults: Vulkan RADV, q4_0 KV cache, flash attention, no-mmap, full GPU offload. Key flags:
|
||||||
|
- `--ngram` — n-gram speculative decoding (~1.1-1.4x tg for repetitive content)
|
||||||
|
- `--no-think` — disables thinking/reasoning via `--reasoning-budget 0` (faster for evals)
|
||||||
|
- `--ctx N` — context size (default 131072)
|
||||||
|
- `--parallel N` — concurrent request slots
|
||||||
|
|
||||||
|
## System Tuning
|
||||||
|
|
||||||
|
`scripts/optimize/power-profile.sh` applies Phase 2 optimizations: RyzenAdj PPT increase (85W target, HP caps at 70W sustained), sysctl tuning (vm.swappiness=1, vm.max_map_count=500000), THP=always, RADV_PERFTEST=nogttspill. Systemd services for boot/resume persistence at `configs/ryzenadj-llm.service` and `configs/ryzenadj-resume.service`.
|
||||||
|
|
||||||
## Agentic Evaluation
|
## Agentic Evaluation
|
||||||
|
|
||||||
Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `data/venv/`. Eval frameworks: inspect-ai (all-in-one), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint (ollama or llama.cpp server). Model catalog at `configs/models.conf`.
|
Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `.venv/` (Python 3.13, dependencies in `requirements.txt`). Eval frameworks: inspect-ai (all-in-one), inspect-evals (task definitions), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint — auto-detects llama-server (port 8080) or ollama (port 11434). Model catalog at `configs/models.conf`.
|
||||||
|
|
||||||
## External Resources
|
## External Resources
|
||||||
|
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ case "${1:-help}" in
|
|||||||
echo " --category LIST Comma-separated: smoke,dense,moe"
|
echo " --category LIST Comma-separated: smoke,dense,moe"
|
||||||
echo " --skip-longctx Skip long-context (32K) tests"
|
echo " --skip-longctx Skip long-context (32K) tests"
|
||||||
echo " --reps N Standard test repetitions (default: 5)"
|
echo " --reps N Standard test repetitions (default: 5)"
|
||||||
|
echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)"
|
||||||
echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
|
echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Examples:"
|
echo "Examples:"
|
||||||
|
|||||||
12
requirements.txt
Normal file
12
requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Agentic evaluation frameworks
|
||||||
|
# Install: python3.13 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt
|
||||||
|
# Requires Python >=3.10, <3.14 (bigcodebench constraint)
|
||||||
|
|
||||||
|
inspect-ai>=0.3.201
|
||||||
|
inspect-evals>=0.6.0
|
||||||
|
evalplus>=0.3.1
|
||||||
|
bigcodebench>=0.2.5
|
||||||
|
openai>=2.26.0
|
||||||
|
|
||||||
|
# IFEval dependency (not on PyPI)
|
||||||
|
instruction_following_eval @ git+https://github.com/josejg/instruction_following_eval
|
||||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
|||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||||
|
|
||||||
VENV_DIR="$(data_dir venv)"
|
VENV_DIR="$PROJECT_ROOT/.venv"
|
||||||
EVAL_DIR="$(data_dir evals)"
|
EVAL_DIR="$(data_dir evals)"
|
||||||
|
|
||||||
# ── Argument parsing ─────────────────────────────────────
|
# ── Argument parsing ─────────────────────────────────────
|
||||||
@@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
# ── Validation ───────────────────────────────────────────
|
# ── Validation ───────────────────────────────────────────
|
||||||
if [[ -z "$MODEL" ]]; then
|
|
||||||
log_error "Model name required. Use --model NAME"
|
|
||||||
log_info "Examples:"
|
|
||||||
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
|
|
||||||
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
||||||
log_error "Virtual environment not found. Run: make agentic-setup"
|
log_error "Virtual environment not found. Run: make agentic-setup"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
source "$VENV_DIR/bin/activate"
|
source "$VENV_DIR/bin/activate"
|
||||||
|
|
||||||
# Check server is reachable
|
# Auto-detect server if no explicit endpoint given
|
||||||
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
|
if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
|
||||||
# Try ollama native endpoint
|
if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
|
||||||
if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
|
ENDPOINT="http://localhost:8080/v1"
|
||||||
log_info "Ollama detected, using OpenAI-compat endpoint"
|
log_info "Auto-detected llama-server at localhost:8080"
|
||||||
|
elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
|
||||||
|
log_info "Auto-detected ollama at localhost:11434"
|
||||||
else
|
else
|
||||||
log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
|
log_error "No LLM server found. Start one first:"
|
||||||
|
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
|
||||||
|
log_info " ollama serve (ollama)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
|
||||||
|
! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
|
||||||
|
log_error "No LLM server at $ENDPOINT"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Auto-detect model name from server if not provided
|
||||||
|
if [[ -z "$MODEL" ]]; then
|
||||||
|
DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
|
||||||
|
import sys, json
|
||||||
|
try:
|
||||||
|
data = json.load(sys.stdin)
|
||||||
|
models = data.get('data', [])
|
||||||
|
if models:
|
||||||
|
print(models[0].get('id', ''))
|
||||||
|
except: pass
|
||||||
|
" 2>/dev/null || true)
|
||||||
|
if [[ -n "$DETECTED_MODEL" ]]; then
|
||||||
|
MODEL="$DETECTED_MODEL"
|
||||||
|
log_info "Auto-detected model: $MODEL"
|
||||||
|
else
|
||||||
|
log_error "Model name required. Use --model NAME"
|
||||||
|
log_info "Examples:"
|
||||||
|
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
|
||||||
|
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
TS="$(timestamp)"
|
TS="$(timestamp)"
|
||||||
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
|
# Build a filesystem-safe model name: every run of characters outside
# [a-zA-Z0-9._-] collapses to a single underscore. printf (not echo) is used
# so no trailing newline reaches tr — tr would otherwise translate that
# newline into a stray trailing "_" in the run directory name.
SAFE_MODEL="$(printf '%s' "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"
|
||||||
|
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
|
||||||
mkdir -p "$RUN_DIR"
|
mkdir -p "$RUN_DIR"
|
||||||
|
|
||||||
log_header "Agentic Evaluation: $SUITE"
|
log_header "Agentic Evaluation: $SUITE"
|
||||||
@@ -86,7 +112,11 @@ ENDJSON
|
|||||||
METRICS_FILE="$RUN_DIR/metrics.csv"
|
METRICS_FILE="$RUN_DIR/metrics.csv"
|
||||||
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
|
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
|
||||||
METRICS_PID=$!
|
METRICS_PID=$!
|
||||||
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
|
# Stop the background metrics logger started above and reap it.
# Globals: METRICS_PID (read) — PID of the log-metrics.sh background job.
# Both steps are best-effort: the logger may already have exited, and `wait`
# returns the job's signal status, so failures are deliberately ignored to
# keep `set -e` from aborting the handler.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
# Register the bare handler. Calling `exit 0` inside an EXIT trap would
# overwrite the script's real exit status and make failed runs look successful;
# without it, bash keeps the status the script was exiting with.
trap cleanup EXIT
|
||||||
|
|
||||||
# ── Suite execution ──────────────────────────────────────
|
# ── Suite execution ──────────────────────────────────────
|
||||||
|
|
||||||
@@ -113,14 +143,14 @@ run_evalplus() {
|
|||||||
run_inspect_eval() {
|
run_inspect_eval() {
|
||||||
local eval_name="$1"
|
local eval_name="$1"
|
||||||
local display_name="$2"
|
local display_name="$2"
|
||||||
|
local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval
|
||||||
log_info "Running Inspect AI: $display_name..."
|
log_info "Running Inspect AI: $display_name..."
|
||||||
local out="$RUN_DIR/inspect-${eval_name}.json"
|
|
||||||
|
|
||||||
OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
|
OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
|
||||||
inspect eval "$eval_name" \
|
inspect eval "$eval_name" \
|
||||||
--model "openai/$MODEL" \
|
--model "openai/$MODEL" \
|
||||||
--log-dir "$RUN_DIR/inspect-logs/" \
|
--log-dir "$RUN_DIR/inspect-logs/" \
|
||||||
2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
|
2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"
|
||||||
|
|
||||||
log_success "Inspect $display_name complete"
|
log_success "Inspect $display_name complete"
|
||||||
}
|
}
|
||||||
@@ -138,7 +168,7 @@ run_bigcodebench() {
|
|||||||
case "$SUITE" in
|
case "$SUITE" in
|
||||||
quick)
|
quick)
|
||||||
run_evalplus "humaneval"
|
run_evalplus "humaneval"
|
||||||
run_inspect_eval "ifeval" "IFEval (instruction following)"
|
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
|
||||||
;;
|
;;
|
||||||
code)
|
code)
|
||||||
run_evalplus "humaneval"
|
run_evalplus "humaneval"
|
||||||
@@ -146,13 +176,13 @@ case "$SUITE" in
|
|||||||
run_bigcodebench
|
run_bigcodebench
|
||||||
;;
|
;;
|
||||||
tooluse)
|
tooluse)
|
||||||
run_inspect_eval "bfcl" "BFCL (function calling)"
|
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
|
||||||
;;
|
;;
|
||||||
full)
|
full)
|
||||||
run_evalplus "humaneval"
|
run_evalplus "humaneval"
|
||||||
run_evalplus "mbpp"
|
run_evalplus "mbpp"
|
||||||
run_inspect_eval "ifeval" "IFEval (instruction following)"
|
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
|
||||||
run_inspect_eval "bfcl" "BFCL (function calling)"
|
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
|
||||||
run_bigcodebench
|
run_bigcodebench
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
@@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh"
|
|||||||
log_header "Agentic Evaluation Setup"
|
log_header "Agentic Evaluation Setup"
|
||||||
|
|
||||||
# ── Python virtual environment ───────────────────────────
|
# ── Python virtual environment ───────────────────────────
|
||||||
VENV_DIR="$(data_dir venv)"
|
VENV_DIR="$PROJECT_ROOT/.venv"
|
||||||
|
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"
|
||||||
|
|
||||||
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
||||||
log_info "Creating Python virtual environment..."
|
# Prefer Python 3.13 (bigcodebench requires <3.14)
|
||||||
python3 -m venv "$VENV_DIR"
|
PYTHON_BIN="python3.13"
|
||||||
|
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
|
||||||
|
PYTHON_BIN="python3"
|
||||||
|
log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
|
||||||
|
fi
|
||||||
|
log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
|
||||||
|
"$PYTHON_BIN" -m venv "$VENV_DIR"
|
||||||
log_success "Virtual environment created at $VENV_DIR"
|
log_success "Virtual environment created at $VENV_DIR"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
source "$VENV_DIR/bin/activate"
|
source "$VENV_DIR/bin/activate"
|
||||||
log_info "Python: $(python3 --version) from $VENV_DIR"
|
log_info "Python: $(python3 --version) from $VENV_DIR"
|
||||||
|
|
||||||
# ── Install evaluation frameworks ────────────────────────
|
# ── Install from requirements.txt ────────────────────────
|
||||||
|
log_info "Installing dependencies from requirements.txt..."
|
||||||
# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
|
pip install -r "$REQUIREMENTS" 2>&1 | tail -5
|
||||||
if python3 -c "import inspect_ai" 2>/dev/null; then
|
log_success "Dependencies installed"
|
||||||
log_success "inspect-ai already installed"
|
|
||||||
else
|
|
||||||
log_info "Installing inspect-ai (main eval framework)..."
|
|
||||||
pip install inspect-ai 2>&1 | tail -3
|
|
||||||
log_success "inspect-ai installed"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# EvalPlus — HumanEval+ and MBPP+ with native ollama support
|
|
||||||
if python3 -c "import evalplus" 2>/dev/null; then
|
|
||||||
log_success "evalplus already installed"
|
|
||||||
else
|
|
||||||
log_info "Installing evalplus (code generation benchmarks)..."
|
|
||||||
pip install evalplus 2>&1 | tail -3
|
|
||||||
log_success "evalplus installed"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# BigCodeBench
|
|
||||||
if python3 -c "import bigcodebench" 2>/dev/null; then
|
|
||||||
log_success "bigcodebench already installed"
|
|
||||||
else
|
|
||||||
log_info "Installing bigcodebench..."
|
|
||||||
pip install bigcodebench 2>&1 | tail -3
|
|
||||||
log_success "bigcodebench installed"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ── Check for local LLM server ──────────────────────────
|
# ── Check for local LLM server ──────────────────────────
|
||||||
log_header "LLM Server Check"
|
log_header "LLM Server Check"
|
||||||
|
|
||||||
ollama_ok=false
|
if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
|
||||||
llamacpp_ok=false
|
log_success "llama-server running at localhost:8080"
|
||||||
|
elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then
|
||||||
if is_cmd ollama; then
|
log_success "ollama running at localhost:11434"
|
||||||
if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
|
|
||||||
log_success "ollama running at localhost:11434"
|
|
||||||
ollama_ok=true
|
|
||||||
# List available models
|
|
||||||
log_info "Available ollama models:"
|
|
||||||
ollama list 2>/dev/null | head -10 || true
|
|
||||||
else
|
|
||||||
log_warn "ollama installed but not running. Start with: ollama serve"
|
|
||||||
fi
|
|
||||||
else
|
else
|
||||||
log_info "ollama not installed — needed for most agentic benchmarks"
|
log_warn "No local LLM server running. Start one before running evals:"
|
||||||
log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
|
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
|
||||||
fi
|
log_info " ollama serve (ollama)"
|
||||||
|
|
||||||
# Check for llama.cpp server
|
|
||||||
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
|
|
||||||
log_success "llama.cpp server running at localhost:8080"
|
|
||||||
llamacpp_ok=true
|
|
||||||
else
|
|
||||||
log_info "No llama.cpp server detected at localhost:8080"
|
|
||||||
log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! $ollama_ok && ! $llamacpp_ok; then
|
|
||||||
log_warn "No local LLM server running. Agentic benchmarks need one."
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Summary ──────────────────────────────────────────────
|
# ── Summary ──────────────────────────────────────────────
|
||||||
log_header "Setup Complete"
|
log_header "Setup Complete"
|
||||||
echo ""
|
echo ""
|
||||||
echo " Installed tools:"
|
echo " Installed tools:"
|
||||||
echo " inspect-ai — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
|
echo " inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)"
|
||||||
echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
|
echo " inspect-evals — Task definitions for inspect-ai"
|
||||||
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
|
echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
|
||||||
|
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
|
||||||
echo ""
|
echo ""
|
||||||
echo " To activate the virtual environment:"
|
echo " Activate venv: source .venv/bin/activate"
|
||||||
echo " source data/venv/bin/activate"
|
|
||||||
echo ""
|
echo ""
|
||||||
echo " Run evaluations:"
|
echo " Run evaluations:"
|
||||||
echo " make agentic-quick # EvalPlus + IFEval (~1 hour)"
|
echo " make agentic-quick # EvalPlus HumanEval+ + IFEval (~1 hour)"
|
||||||
echo " make agentic-full # BFCL + BigCodeBench (~3-4 hours)"
|
echo " make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)"
|
||||||
|
echo " make agentic-tooluse # BFCL function calling (~1-2 hours)"
|
||||||
|
echo " make agentic-full # All of the above (~5-6 hours)"
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
@@ -270,7 +270,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
||||||
-r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
-r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
|
|
||||||
printf " cmd: %s\n" "${CMD_LC[*]}"
|
printf " cmd: %s\n" "${CMD_LC[*]}"
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
|
|||||||
@@ -252,7 +252,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||||
log_success "Done"; tail -3 "$OUT_LC"
|
log_success "Done"; tail -3 "$OUT_LC"
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ CTX_SIZE=131072
|
|||||||
PARALLEL=1
|
PARALLEL=1
|
||||||
MODEL=""
|
MODEL=""
|
||||||
NGRAM=false
|
NGRAM=false
|
||||||
|
NO_THINK=false
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
||||||
--parallel) PARALLEL="$2"; shift 2 ;;
|
--parallel) PARALLEL="$2"; shift 2 ;;
|
||||||
--ngram) NGRAM=true; shift ;;
|
--ngram) NGRAM=true; shift ;;
|
||||||
|
--no-think) NO_THINK=true; shift ;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
echo "Usage: launch.sh [OPTIONS]"
|
echo "Usage: launch.sh [OPTIONS]"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
echo " --ctx N Context size (default: 131072)"
|
echo " --ctx N Context size (default: 131072)"
|
||||||
echo " --parallel N Parallel request slots (default: 1)"
|
echo " --parallel N Parallel request slots (default: 1)"
|
||||||
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
||||||
|
echo " --no-think Disable thinking/reasoning (faster for evals)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Presets (pass model filename):"
|
echo "Presets (pass model filename):"
|
||||||
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
||||||
@@ -101,7 +104,7 @@ fi
|
|||||||
SERVER_ARGS=(
|
SERVER_ARGS=(
|
||||||
-ngl 99 # Full GPU offload
|
-ngl 99 # Full GPU offload
|
||||||
--no-mmap # Direct load, no mmap overhead
|
--no-mmap # Direct load, no mmap overhead
|
||||||
-fa # Flash attention
|
-fa on # Flash attention
|
||||||
-m "$TOOLBOX_MODEL_PATH"
|
-m "$TOOLBOX_MODEL_PATH"
|
||||||
-c "$CTX_SIZE" # Context size
|
-c "$CTX_SIZE" # Context size
|
||||||
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
||||||
@@ -110,6 +113,11 @@ SERVER_ARGS=(
|
|||||||
-np "$PARALLEL" # Parallel slots
|
-np "$PARALLEL" # Parallel slots
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Disable thinking mode (faster for evals)
|
||||||
|
if $NO_THINK; then
|
||||||
|
SERVER_ARGS+=(--reasoning-budget 0)
|
||||||
|
fi
|
||||||
|
|
||||||
# N-gram speculative decoding
|
# N-gram speculative decoding
|
||||||
if $NGRAM; then
|
if $NGRAM; then
|
||||||
SERVER_ARGS+=(
|
SERVER_ARGS+=(
|
||||||
@@ -126,6 +134,7 @@ log_info "Backend: $BACKEND"
|
|||||||
log_info "Context: $CTX_SIZE tokens"
|
log_info "Context: $CTX_SIZE tokens"
|
||||||
log_info "KV cache: q4_0/q4_0"
|
log_info "KV cache: q4_0/q4_0"
|
||||||
log_info "Parallel slots: $PARALLEL"
|
log_info "Parallel slots: $PARALLEL"
|
||||||
|
$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
|
||||||
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
||||||
log_info "Port: $PORT"
|
log_info "Port: $PORT"
|
||||||
log_info "Endpoint: http://localhost:$PORT"
|
log_info "Endpoint: http://localhost:$PORT"
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--category"
|
assert_output --partial "--category"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--kv-types"
|
assert_output --partial "--kv-types"
|
||||||
|
assert_output --partial "--batch"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "run-suite --help shows usage and exits 0" {
|
@test "run-suite --help shows usage and exits 0" {
|
||||||
@@ -22,6 +23,7 @@ load test_helper.sh
|
|||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--tag"
|
assert_output --partial "--tag"
|
||||||
assert_output --partial "--kv-types"
|
assert_output --partial "--kv-types"
|
||||||
|
assert_output --partial "--batch"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher shows help with no args" {
|
@test "benchmark dispatcher shows help with no args" {
|
||||||
@@ -31,6 +33,18 @@ load test_helper.sh
|
|||||||
assert_output --partial "--max-size"
|
assert_output --partial "--max-size"
|
||||||
assert_output --partial "--skip-longctx"
|
assert_output --partial "--skip-longctx"
|
||||||
assert_output --partial "--kv-types"
|
assert_output --partial "--kv-types"
|
||||||
|
assert_output --partial "--batch"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "serve --help shows usage and exits 0" {
|
||||||
|
run bash "$PROJECT_ROOT/bin/serve" --help
|
||||||
|
assert_success
|
||||||
|
assert_output --partial "Usage"
|
||||||
|
assert_output --partial "--model"
|
||||||
|
assert_output --partial "--ngram"
|
||||||
|
assert_output --partial "--no-think"
|
||||||
|
assert_output --partial "--ctx"
|
||||||
|
assert_output --partial "--port"
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "benchmark dispatcher passes --help through to baseline" {
|
@test "benchmark dispatcher passes --help through to baseline" {
|
||||||
|
|||||||
Reference in New Issue
Block a user