fix: address code review findings — batch args, venv path, serve flags
- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
@@ -5,7 +5,7 @@ set -euo pipefail
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||
|
||||
VENV_DIR="$(data_dir venv)"
|
||||
VENV_DIR="$PROJECT_ROOT/.venv"
|
||||
EVAL_DIR="$(data_dir evals)"
|
||||
|
||||
# ── Argument parsing ─────────────────────────────────────
|
||||
@@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do
|
||||
done
|
||||
|
||||
# ── Validation ───────────────────────────────────────────
|
||||
if [[ -z "$MODEL" ]]; then
|
||||
log_error "Model name required. Use --model NAME"
|
||||
log_info "Examples:"
|
||||
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
|
||||
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
||||
log_error "Virtual environment not found. Run: make agentic-setup"
|
||||
exit 1
|
||||
fi
|
||||
source "$VENV_DIR/bin/activate"
|
||||
|
||||
# Check server is reachable
|
||||
if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
|
||||
# Try ollama native endpoint
|
||||
if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
|
||||
log_info "Ollama detected, using OpenAI-compat endpoint"
|
||||
# Auto-detect server if no explicit endpoint given
|
||||
if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
|
||||
if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
|
||||
ENDPOINT="http://localhost:8080/v1"
|
||||
log_info "Auto-detected llama-server at localhost:8080"
|
||||
elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
|
||||
log_info "Auto-detected ollama at localhost:11434"
|
||||
else
|
||||
log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
|
||||
log_error "No LLM server found. Start one first:"
|
||||
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
|
||||
log_info " ollama serve (ollama)"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
|
||||
! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
|
||||
log_error "No LLM server at $ENDPOINT"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Auto-detect model name from server if not provided
|
||||
if [[ -z "$MODEL" ]]; then
|
||||
DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
|
||||
import sys, json
|
||||
try:
|
||||
data = json.load(sys.stdin)
|
||||
models = data.get('data', [])
|
||||
if models:
|
||||
print(models[0].get('id', ''))
|
||||
except: pass
|
||||
" 2>/dev/null || true)
|
||||
if [[ -n "$DETECTED_MODEL" ]]; then
|
||||
MODEL="$DETECTED_MODEL"
|
||||
log_info "Auto-detected model: $MODEL"
|
||||
else
|
||||
log_error "Model name required. Use --model NAME"
|
||||
log_info "Examples:"
|
||||
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
|
||||
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
TS="$(timestamp)"
|
||||
RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
|
||||
SAFE_MODEL="$(echo "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"
|
||||
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
|
||||
mkdir -p "$RUN_DIR"
|
||||
|
||||
log_header "Agentic Evaluation: $SUITE"
|
||||
@@ -86,7 +112,11 @@ ENDJSON
|
||||
METRICS_FILE="$RUN_DIR/metrics.csv"
|
||||
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
|
||||
METRICS_PID=$!
|
||||
trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
|
||||
cleanup() {
|
||||
kill "$METRICS_PID" 2>/dev/null || true
|
||||
wait "$METRICS_PID" 2>/dev/null || true
|
||||
}
|
||||
trap 'cleanup; exit 0' EXIT
|
||||
|
||||
# ── Suite execution ──────────────────────────────────────
|
||||
|
||||
@@ -113,14 +143,14 @@ run_evalplus() {
|
||||
run_inspect_eval() {
|
||||
local eval_name="$1"
|
||||
local display_name="$2"
|
||||
local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval
|
||||
log_info "Running Inspect AI: $display_name..."
|
||||
local out="$RUN_DIR/inspect-${eval_name}.json"
|
||||
|
||||
OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
|
||||
inspect eval "$eval_name" \
|
||||
--model "openai/$MODEL" \
|
||||
--log-dir "$RUN_DIR/inspect-logs/" \
|
||||
2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
|
||||
2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"
|
||||
|
||||
log_success "Inspect $display_name complete"
|
||||
}
|
||||
@@ -138,7 +168,7 @@ run_bigcodebench() {
|
||||
case "$SUITE" in
|
||||
quick)
|
||||
run_evalplus "humaneval"
|
||||
run_inspect_eval "ifeval" "IFEval (instruction following)"
|
||||
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
|
||||
;;
|
||||
code)
|
||||
run_evalplus "humaneval"
|
||||
@@ -146,13 +176,13 @@ case "$SUITE" in
|
||||
run_bigcodebench
|
||||
;;
|
||||
tooluse)
|
||||
run_inspect_eval "bfcl" "BFCL (function calling)"
|
||||
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
|
||||
;;
|
||||
full)
|
||||
run_evalplus "humaneval"
|
||||
run_evalplus "mbpp"
|
||||
run_inspect_eval "ifeval" "IFEval (instruction following)"
|
||||
run_inspect_eval "bfcl" "BFCL (function calling)"
|
||||
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
|
||||
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
|
||||
run_bigcodebench
|
||||
;;
|
||||
*)
|
||||
|
||||
@@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh"
|
||||
log_header "Agentic Evaluation Setup"
|
||||
|
||||
# ── Python virtual environment ───────────────────────────
|
||||
VENV_DIR="$(data_dir venv)"
|
||||
VENV_DIR="$PROJECT_ROOT/.venv"
|
||||
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"
|
||||
|
||||
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
|
||||
log_info "Creating Python virtual environment..."
|
||||
python3 -m venv "$VENV_DIR"
|
||||
# Prefer Python 3.13 (bigcodebench requires <3.14)
|
||||
PYTHON_BIN="python3.13"
|
||||
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
|
||||
PYTHON_BIN="python3"
|
||||
log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
|
||||
fi
|
||||
log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
|
||||
"$PYTHON_BIN" -m venv "$VENV_DIR"
|
||||
log_success "Virtual environment created at $VENV_DIR"
|
||||
fi
|
||||
|
||||
source "$VENV_DIR/bin/activate"
|
||||
log_info "Python: $(python3 --version) from $VENV_DIR"
|
||||
|
||||
# ── Install evaluation frameworks ────────────────────────
|
||||
|
||||
# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
|
||||
if python3 -c "import inspect_ai" 2>/dev/null; then
|
||||
log_success "inspect-ai already installed"
|
||||
else
|
||||
log_info "Installing inspect-ai (main eval framework)..."
|
||||
pip install inspect-ai 2>&1 | tail -3
|
||||
log_success "inspect-ai installed"
|
||||
fi
|
||||
|
||||
# EvalPlus — HumanEval+ and MBPP+ with native ollama support
|
||||
if python3 -c "import evalplus" 2>/dev/null; then
|
||||
log_success "evalplus already installed"
|
||||
else
|
||||
log_info "Installing evalplus (code generation benchmarks)..."
|
||||
pip install evalplus 2>&1 | tail -3
|
||||
log_success "evalplus installed"
|
||||
fi
|
||||
|
||||
# BigCodeBench
|
||||
if python3 -c "import bigcodebench" 2>/dev/null; then
|
||||
log_success "bigcodebench already installed"
|
||||
else
|
||||
log_info "Installing bigcodebench..."
|
||||
pip install bigcodebench 2>&1 | tail -3
|
||||
log_success "bigcodebench installed"
|
||||
fi
|
||||
# ── Install from requirements.txt ────────────────────────
|
||||
log_info "Installing dependencies from requirements.txt..."
|
||||
pip install -r "$REQUIREMENTS" 2>&1 | tail -5
|
||||
log_success "Dependencies installed"
|
||||
|
||||
# ── Check for local LLM server ──────────────────────────
|
||||
log_header "LLM Server Check"
|
||||
|
||||
ollama_ok=false
|
||||
llamacpp_ok=false
|
||||
|
||||
if is_cmd ollama; then
|
||||
if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
|
||||
log_success "ollama running at localhost:11434"
|
||||
ollama_ok=true
|
||||
# List available models
|
||||
log_info "Available ollama models:"
|
||||
ollama list 2>/dev/null | head -10 || true
|
||||
else
|
||||
log_warn "ollama installed but not running. Start with: ollama serve"
|
||||
fi
|
||||
if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
|
||||
log_success "llama-server running at localhost:8080"
|
||||
elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then
|
||||
log_success "ollama running at localhost:11434"
|
||||
else
|
||||
log_info "ollama not installed — needed for most agentic benchmarks"
|
||||
log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
|
||||
fi
|
||||
|
||||
# Check for llama.cpp server
|
||||
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
|
||||
log_success "llama.cpp server running at localhost:8080"
|
||||
llamacpp_ok=true
|
||||
else
|
||||
log_info "No llama.cpp server detected at localhost:8080"
|
||||
log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
|
||||
fi
|
||||
|
||||
if ! $ollama_ok && ! $llamacpp_ok; then
|
||||
log_warn "No local LLM server running. Agentic benchmarks need one."
|
||||
log_warn "No local LLM server running. Start one before running evals:"
|
||||
log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
|
||||
log_info " ollama serve (ollama)"
|
||||
fi
|
||||
|
||||
# ── Summary ──────────────────────────────────────────────
|
||||
log_header "Setup Complete"
|
||||
echo ""
|
||||
echo " Installed tools:"
|
||||
echo " inspect-ai — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
|
||||
echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
|
||||
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
|
||||
echo " inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)"
|
||||
echo " inspect-evals — Task definitions for inspect-ai"
|
||||
echo " evalplus — HumanEval+ / MBPP+ with native ollama support"
|
||||
echo " bigcodebench — 1,140 coding tasks across 139 libraries"
|
||||
echo ""
|
||||
echo " To activate the virtual environment:"
|
||||
echo " source data/venv/bin/activate"
|
||||
echo " Activate venv: source .venv/bin/activate"
|
||||
echo ""
|
||||
echo " Run evaluations:"
|
||||
echo " make agentic-quick # EvalPlus + IFEval (~1 hour)"
|
||||
echo " make agentic-full # BFCL + BigCodeBench (~3-4 hours)"
|
||||
echo " make agentic-quick # EvalPlus HumanEval+ + IFEval (~1 hour)"
|
||||
echo " make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)"
|
||||
echo " make agentic-tooluse # BFCL function calling (~1-2 hours)"
|
||||
echo " make agentic-full # All of the above (~5-6 hours)"
|
||||
echo ""
|
||||
|
||||
@@ -270,7 +270,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE"
|
||||
-r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
||||
-r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||
|
||||
printf " cmd: %s\n" "${CMD_LC[*]}"
|
||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||
|
||||
@@ -252,7 +252,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||
UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
|
||||
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
|
||||
-ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1
|
||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}")
|
||||
-p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}")
|
||||
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
|
||||
log_success "Done"; tail -3 "$OUT_LC"
|
||||
else
|
||||
|
||||
@@ -14,6 +14,7 @@ CTX_SIZE=131072
|
||||
PARALLEL=1
|
||||
MODEL=""
|
||||
NGRAM=false
|
||||
NO_THINK=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
@@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do
|
||||
--ctx) CTX_SIZE="$2"; shift 2 ;;
|
||||
--parallel) PARALLEL="$2"; shift 2 ;;
|
||||
--ngram) NGRAM=true; shift ;;
|
||||
--no-think) NO_THINK=true; shift ;;
|
||||
--help|-h)
|
||||
echo "Usage: launch.sh [OPTIONS]"
|
||||
echo ""
|
||||
@@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do
|
||||
echo " --ctx N Context size (default: 131072)"
|
||||
echo " --parallel N Parallel request slots (default: 1)"
|
||||
echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)"
|
||||
echo " --no-think Disable thinking/reasoning (faster for evals)"
|
||||
echo ""
|
||||
echo "Presets (pass model filename):"
|
||||
echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver"
|
||||
@@ -101,7 +104,7 @@ fi
|
||||
SERVER_ARGS=(
|
||||
-ngl 99 # Full GPU offload
|
||||
--no-mmap # Direct load, no mmap overhead
|
||||
-fa # Flash attention
|
||||
-fa on # Flash attention
|
||||
-m "$TOOLBOX_MODEL_PATH"
|
||||
-c "$CTX_SIZE" # Context size
|
||||
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
||||
@@ -110,6 +113,11 @@ SERVER_ARGS=(
|
||||
-np "$PARALLEL" # Parallel slots
|
||||
)
|
||||
|
||||
# Disable thinking mode (faster for evals)
|
||||
if $NO_THINK; then
|
||||
SERVER_ARGS+=(--reasoning-budget 0)
|
||||
fi
|
||||
|
||||
# N-gram speculative decoding
|
||||
if $NGRAM; then
|
||||
SERVER_ARGS+=(
|
||||
@@ -126,6 +134,7 @@ log_info "Backend: $BACKEND"
|
||||
log_info "Context: $CTX_SIZE tokens"
|
||||
log_info "KV cache: q4_0/q4_0"
|
||||
log_info "Parallel slots: $PARALLEL"
|
||||
$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)"
|
||||
$NGRAM && log_info "N-gram speculative: enabled (draft-max=64)"
|
||||
log_info "Port: $PORT"
|
||||
log_info "Endpoint: http://localhost:$PORT"
|
||||
|
||||
Reference in New Issue
Block a user