fix: address code review findings — batch args, venv path, serve flags

- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
Felipe Cardoso
2026-03-31 10:10:48 +02:00
parent dd403a907c
commit 6ab08537ca
10 changed files with 137 additions and 93 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,5 @@
data/
.venv/
*.log
*.csv
*.tmp

View File

@@ -41,9 +41,21 @@ make verify # 9-point optimization checklist
bin/audit --json | python3 -m json.tool  # Verify JSON output is valid
```
## Serving
`scripts/serve/launch.sh` with dispatcher at `bin/serve`. Launches llama-server inside toolbox containers with optimized defaults: Vulkan RADV, q4_0 KV cache, flash attention, no-mmap, full GPU offload. Key flags:
- `--ngram` — n-gram speculative decoding (~1.1-1.4x tg for repetitive content)
- `--no-think` — disables thinking/reasoning via `--reasoning-budget 0` (faster for evals)
- `--ctx N` — context size (default 131072)
- `--parallel N` — concurrent request slots
## System Tuning
`scripts/optimize/power-profile.sh` applies Phase 2 optimizations: RyzenAdj PPT increase (85W target, HP caps at 70W sustained), sysctl tuning (vm.swappiness=1, vm.max_map_count=500000), THP=always, RADV_PERFTEST=nogttspill. Systemd services for boot/resume persistence at `configs/ryzenadj-llm.service` and `configs/ryzenadj-resume.service`.
## Agentic Evaluation ## Agentic Evaluation
Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `data/venv/`. Eval frameworks: inspect-ai (all-in-one), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint (ollama or llama.cpp server). Model catalog at `configs/models.conf`. Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `.venv/` (Python 3.13, dependencies in `requirements.txt`). Eval frameworks: inspect-ai (all-in-one), inspect-evals (task definitions), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint — auto-detects llama-server (port 8080) or ollama (port 11434). Model catalog at `configs/models.conf`.
## External Resources ## External Resources

View File

@@ -23,6 +23,7 @@ case "${1:-help}" in
echo "  --category LIST   Comma-separated: smoke,dense,moe"
echo "  --skip-longctx    Skip long-context (32K) tests"
echo "  --reps N          Standard test repetitions (default: 5)"
echo "  -b, --batch N     Batch size (default: 2048, try 256 for MoE)"
echo "  --kv-types LIST   KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)"
echo ""
echo "Examples:"

12
requirements.txt Normal file
View File

@@ -0,0 +1,12 @@
# Agentic evaluation frameworks
# Install: python3.13 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt
# Requires Python >=3.10, <3.14 (bigcodebench constraint)
inspect-ai>=0.3.201
inspect-evals>=0.6.0
evalplus>=0.3.1
bigcodebench>=0.2.5
openai>=2.26.0
# IFEval dependency (not on PyPI)
instruction_following_eval @ git+https://github.com/josejg/instruction_following_eval

View File

@@ -5,7 +5,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
VENV_DIR="$(data_dir venv)" VENV_DIR="$PROJECT_ROOT/.venv"
EVAL_DIR="$(data_dir evals)" EVAL_DIR="$(data_dir evals)"
# ── Argument parsing ───────────────────────────────────── # ── Argument parsing ─────────────────────────────────────
@@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do
done done
# ── Validation ─────────────────────────────────────────── # ── Validation ───────────────────────────────────────────
if [[ -z "$MODEL" ]]; then
log_error "Model name required. Use --model NAME"
log_info "Examples:"
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
exit 1
fi
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
log_error "Virtual environment not found. Run: make agentic-setup" log_error "Virtual environment not found. Run: make agentic-setup"
exit 1 exit 1
fi fi
source "$VENV_DIR/bin/activate" source "$VENV_DIR/bin/activate"
# Check server is reachable # Auto-detect server if no explicit endpoint given
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
# Try ollama native endpoint if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then ENDPOINT="http://localhost:8080/v1"
log_info "Ollama detected, using OpenAI-compat endpoint" log_info "Auto-detected llama-server at localhost:8080"
elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
log_info "Auto-detected ollama at localhost:11434"
else else
log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first." log_error "No LLM server found. Start one first:"
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
log_info " ollama serve (ollama)"
exit 1
fi
else
if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
log_error "No LLM server at $ENDPOINT"
exit 1
fi
fi
# Auto-detect model name from server if not provided
if [[ -z "$MODEL" ]]; then
DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
import sys, json
try:
data = json.load(sys.stdin)
models = data.get('data', [])
if models:
print(models[0].get('id', ''))
except: pass
" 2>/dev/null || true)
if [[ -n "$DETECTED_MODEL" ]]; then
MODEL="$DETECTED_MODEL"
log_info "Auto-detected model: $MODEL"
else
log_error "Model name required. Use --model NAME"
log_info "Examples:"
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
exit 1 exit 1
fi fi
fi fi
TS="$(timestamp)" TS="$(timestamp)"
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}" SAFE_MODEL="$(echo "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
mkdir -p "$RUN_DIR" mkdir -p "$RUN_DIR"
log_header "Agentic Evaluation: $SUITE" log_header "Agentic Evaluation: $SUITE"
@@ -86,7 +112,11 @@ ENDJSON
METRICS_FILE="$RUN_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT cleanup() {
kill "$METRICS_PID" 2>/dev/null || true
wait "$METRICS_PID" 2>/dev/null || true
}
trap 'cleanup; exit 0' EXIT
# ── Suite execution ────────────────────────────────────── # ── Suite execution ──────────────────────────────────────
@@ -113,14 +143,14 @@ run_evalplus() {
run_inspect_eval() { run_inspect_eval() {
local eval_name="$1" local eval_name="$1"
local display_name="$2" local display_name="$2"
local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval
log_info "Running Inspect AI: $display_name..." log_info "Running Inspect AI: $display_name..."
local out="$RUN_DIR/inspect-${eval_name}.json"
OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \ OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
inspect eval "$eval_name" \ inspect eval "$eval_name" \
--model "openai/$MODEL" \ --model "openai/$MODEL" \
--log-dir "$RUN_DIR/inspect-logs/" \ --log-dir "$RUN_DIR/inspect-logs/" \
2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log" 2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"
log_success "Inspect $display_name complete" log_success "Inspect $display_name complete"
} }
@@ -138,7 +168,7 @@ run_bigcodebench() {
case "$SUITE" in case "$SUITE" in
quick) quick)
run_evalplus "humaneval" run_evalplus "humaneval"
run_inspect_eval "ifeval" "IFEval (instruction following)" run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
;; ;;
code) code)
run_evalplus "humaneval" run_evalplus "humaneval"
@@ -146,13 +176,13 @@ case "$SUITE" in
run_bigcodebench run_bigcodebench
;; ;;
tooluse) tooluse)
run_inspect_eval "bfcl" "BFCL (function calling)" run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
;; ;;
full) full)
run_evalplus "humaneval" run_evalplus "humaneval"
run_evalplus "mbpp" run_evalplus "mbpp"
run_inspect_eval "ifeval" "IFEval (instruction following)" run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
run_inspect_eval "bfcl" "BFCL (function calling)" run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
run_bigcodebench run_bigcodebench
;; ;;
*) *)

View File

@@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh"
log_header "Agentic Evaluation Setup" log_header "Agentic Evaluation Setup"
# ── Python virtual environment ─────────────────────────── # ── Python virtual environment ───────────────────────────
VENV_DIR="$(data_dir venv)" VENV_DIR="$PROJECT_ROOT/.venv"
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
log_info "Creating Python virtual environment..." # Prefer Python 3.13 (bigcodebench requires <3.14)
python3 -m venv "$VENV_DIR" PYTHON_BIN="python3.13"
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
PYTHON_BIN="python3"
log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
fi
log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
"$PYTHON_BIN" -m venv "$VENV_DIR"
log_success "Virtual environment created at $VENV_DIR" log_success "Virtual environment created at $VENV_DIR"
fi fi
source "$VENV_DIR/bin/activate" source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR" log_info "Python: $(python3 --version) from $VENV_DIR"
# ── Install evaluation frameworks ──────────────────────── # ── Install from requirements.txt ────────────────────────
log_info "Installing dependencies from requirements.txt..."
# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.) pip install -r "$REQUIREMENTS" 2>&1 | tail -5
if python3 -c "import inspect_ai" 2>/dev/null; then log_success "Dependencies installed"
log_success "inspect-ai already installed"
else
log_info "Installing inspect-ai (main eval framework)..."
pip install inspect-ai 2>&1 | tail -3
log_success "inspect-ai installed"
fi
# EvalPlus — HumanEval+ and MBPP+ with native ollama support
if python3 -c "import evalplus" 2>/dev/null; then
log_success "evalplus already installed"
else
log_info "Installing evalplus (code generation benchmarks)..."
pip install evalplus 2>&1 | tail -3
log_success "evalplus installed"
fi
# BigCodeBench
if python3 -c "import bigcodebench" 2>/dev/null; then
log_success "bigcodebench already installed"
else
log_info "Installing bigcodebench..."
pip install bigcodebench 2>&1 | tail -3
log_success "bigcodebench installed"
fi
# ── Check for local LLM server ────────────────────────── # ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check" log_header "LLM Server Check"
ollama_ok=false if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
llamacpp_ok=false log_success "llama-server running at localhost:8080"
elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then
if is_cmd ollama; then log_success "ollama running at localhost:11434"
if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
log_success "ollama running at localhost:11434"
ollama_ok=true
# List available models
log_info "Available ollama models:"
ollama list 2>/dev/null | head -10 || true
else
log_warn "ollama installed but not running. Start with: ollama serve"
fi
else else
log_info "ollama not installed — needed for most agentic benchmarks" log_warn "No local LLM server running. Start one before running evals:"
log_info "Install: curl -fsSL https://ollama.com/install.sh | sh" log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
fi log_info " ollama serve (ollama)"
# Check for llama.cpp server
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
log_success "llama.cpp server running at localhost:8080"
llamacpp_ok=true
else
log_info "No llama.cpp server detected at localhost:8080"
log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi
if ! $ollama_ok && ! $llamacpp_ok; then
log_warn "No local LLM server running. Agentic benchmarks need one."
fi fi
# ── Summary ────────────────────────────────────────────── # ── Summary ──────────────────────────────────────────────
log_header "Setup Complete" log_header "Setup Complete"
echo "" echo ""
echo " Installed tools:" echo " Installed tools:"
echo " inspect-ai — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)" echo " inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)"
echo " evalplus — HumanEval+ / MBPP+ with native ollama support" echo " inspect-evals — Task definitions for inspect-ai"
echo " bigcodebench — 1,140 coding tasks across 139 libraries" echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
echo "" echo ""
echo " To activate the virtual environment:" echo " Activate venv: source .venv/bin/activate"
echo " source data/venv/bin/activate"
echo "" echo ""
echo " Run evaluations:" echo " Run evaluations:"
echo " make agentic-quick # EvalPlus + IFEval (~1 hour)" echo " make agentic-quick # EvalPlus HumanEval+ + IFEval (~1 hour)"
echo " make agentic-full # BFCL + BigCodeBench (~3-4 hours)" echo " make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)"
echo " make agentic-tooluse # BFCL function calling (~1-2 hours)"
echo " make agentic-full # All of the above (~5-6 hours)"
echo "" echo ""

View File

@@ -270,7 +270,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
-r "$REPS_LONGCTX" "${KV_ARGS[@]}") -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
printf " cmd: %s\n" "${CMD_LC[*]}" printf " cmd: %s\n" "${CMD_LC[*]}"
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then

View File

@@ -252,7 +252,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}") -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
log_success "Done"; tail -3 "$OUT_LC" log_success "Done"; tail -3 "$OUT_LC"
else else

View File

@@ -14,6 +14,7 @@ CTX_SIZE=131072
PARALLEL=1
MODEL=""
NGRAM=false
NO_THINK=false
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case "$1" in case "$1" in
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
--ctx) CTX_SIZE="$2"; shift 2 ;; --ctx) CTX_SIZE="$2"; shift 2 ;;
--parallel) PARALLEL="$2"; shift 2 ;; --parallel) PARALLEL="$2"; shift 2 ;;
--ngram) NGRAM=true; shift ;; --ngram) NGRAM=true; shift ;;
--no-think) NO_THINK=true; shift ;;
--help|-h) --help|-h)
echo "Usage: launch.sh [OPTIONS]" echo "Usage: launch.sh [OPTIONS]"
echo "" echo ""
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
echo " --ctx N Context size (default: 131072)" echo " --ctx N Context size (default: 131072)"
echo " --parallel N Parallel request slots (default: 1)" echo " --parallel N Parallel request slots (default: 1)"
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)" echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
echo " --no-think Disable thinking/reasoning (faster for evals)"
echo "" echo ""
echo "Presets (pass model filename):" echo "Presets (pass model filename):"
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver" echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
@@ -101,7 +104,7 @@ fi
SERVER_ARGS=( SERVER_ARGS=(
-ngl 99 # Full GPU offload -ngl 99 # Full GPU offload
--no-mmap # Direct load, no mmap overhead --no-mmap # Direct load, no mmap overhead
-fa # Flash attention -fa on # Flash attention
-m "$TOOLBOX_MODEL_PATH" -m "$TOOLBOX_MODEL_PATH"
-c "$CTX_SIZE" # Context size -c "$CTX_SIZE" # Context size
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan) --cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
@@ -110,6 +113,11 @@ SERVER_ARGS=(
-np "$PARALLEL" # Parallel slots -np "$PARALLEL" # Parallel slots
) )
# Disable thinking mode (faster for evals)
if $NO_THINK; then
SERVER_ARGS+=(--reasoning-budget 0)
fi
# N-gram speculative decoding # N-gram speculative decoding
if $NGRAM; then if $NGRAM; then
SERVER_ARGS+=( SERVER_ARGS+=(
@@ -126,6 +134,7 @@ log_info "Backend: $BACKEND"
log_info "Context: $CTX_SIZE tokens" log_info "Context: $CTX_SIZE tokens"
log_info "KV cache: q4_0/q4_0" log_info "KV cache: q4_0/q4_0"
log_info "Parallel slots: $PARALLEL" log_info "Parallel slots: $PARALLEL"
$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)" $NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
log_info "Port: $PORT" log_info "Port: $PORT"
log_info "Endpoint: http://localhost:$PORT" log_info "Endpoint: http://localhost:$PORT"

View File

@@ -11,6 +11,7 @@ load test_helper.sh
assert_output --partial "--category"
assert_output --partial "--skip-longctx"
assert_output --partial "--kv-types"
assert_output --partial "--batch"
} }
@test "run-suite --help shows usage and exits 0" { @test "run-suite --help shows usage and exits 0" {
@@ -22,6 +23,7 @@ load test_helper.sh
assert_output --partial "--skip-longctx" assert_output --partial "--skip-longctx"
assert_output --partial "--tag" assert_output --partial "--tag"
assert_output --partial "--kv-types" assert_output --partial "--kv-types"
assert_output --partial "--batch"
} }
@test "benchmark dispatcher shows help with no args" { @test "benchmark dispatcher shows help with no args" {
@@ -31,6 +33,18 @@ load test_helper.sh
assert_output --partial "--max-size" assert_output --partial "--max-size"
assert_output --partial "--skip-longctx" assert_output --partial "--skip-longctx"
assert_output --partial "--kv-types" assert_output --partial "--kv-types"
assert_output --partial "--batch"
}
@test "serve --help shows usage and exits 0" {
run bash "$PROJECT_ROOT/bin/serve" --help
assert_success
assert_output --partial "Usage"
assert_output --partial "--model"
assert_output --partial "--ngram"
assert_output --partial "--no-think"
assert_output --partial "--ctx"
assert_output --partial "--port"
} }
@test "benchmark dispatcher passes --help through to baseline" { @test "benchmark dispatcher passes --help through to baseline" {