fix: address code review findings — batch args, venv path, serve flags

- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
2026-03-31 10:10:48 +02:00
parent dd403a907c
commit 6ab08537ca
10 changed files with 137 additions and 93 deletions
--- a/scripts/agentic/run-eval.sh
+++ b/scripts/agentic/run-eval.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "$SCRIPT_DIR/../../lib/common.sh"

-VENV_DIR="$(data_dir venv)"
+VENV_DIR="$PROJECT_ROOT/.venv"
 EVAL_DIR="$(data_dir evals)"

 # ── Argument parsing ─────────────────────────────────────
@@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do
 done

 # ── Validation ───────────────────────────────────────────
-if [[ -z "$MODEL" ]]; then
-    log_error "Model name required. Use --model NAME"
-    log_info "Examples:"
-    log_info "  --model qwen3.5:35b-a3b-q8_0       (ollama)"
-    log_info "  --model Qwen3.5-35B-A3B-Q8_0        (llama.cpp server)"
-    exit 1
-fi
-
 if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
    log_error "Virtual environment not found. Run: make agentic-setup"
    exit 1
 fi
 source "$VENV_DIR/bin/activate"

-# Check server is reachable
-if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
-    # Try ollama native endpoint
-    if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
-        log_info "Ollama detected, using OpenAI-compat endpoint"
+# Auto-detect server if no explicit endpoint given
+if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
+    if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
+        ENDPOINT="http://localhost:8080/v1"
+        log_info "Auto-detected llama-server at localhost:8080"
+    elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
+        log_info "Auto-detected ollama at localhost:11434"
    else
-        log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first."
+        log_error "No LLM server found. Start one first:"
+        log_info "  make serve ARGS=\"-m MODEL.gguf\"        (llama-server)"
+        log_info "  ollama serve                            (ollama)"
+        exit 1
+    fi
+else
+    if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
+       ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
+        log_error "No LLM server at $ENDPOINT"
+        exit 1
+    fi
+fi
+
+# Auto-detect model name from server if not provided
+if [[ -z "$MODEL" ]]; then
+    DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
+import sys, json
+try:
+    data = json.load(sys.stdin)
+    models = data.get('data', [])
+    if models:
+        print(models[0].get('id', ''))
+except: pass
+" 2>/dev/null || true)
+    if [[ -n "$DETECTED_MODEL" ]]; then
+        MODEL="$DETECTED_MODEL"
+        log_info "Auto-detected model: $MODEL"
+    else
+        log_error "Model name required. Use --model NAME"
+        log_info "Examples:"
+        log_info "  --model qwen3.5:35b-a3b-q8_0       (ollama)"
+        log_info "  --model Qwen3.5-35B-A3B-Q8_0        (llama.cpp server)"
        exit 1
    fi
 fi

 TS="$(timestamp)"
-RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}"
+SAFE_MODEL="$(printf '%s' "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"  # printf, not echo: echo's trailing newline would be mapped to a stray trailing '_'
+RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
 mkdir -p "$RUN_DIR"

 log_header "Agentic Evaluation: $SUITE"
@@ -86,7 +112,11 @@ ENDJSON
 METRICS_FILE="$RUN_DIR/metrics.csv"
 bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
 METRICS_PID=$!
-trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT
+cleanup() {  # Stop the background metrics logger; both steps tolerate it having already exited.
+    kill "$METRICS_PID" 2>/dev/null || true
+    wait "$METRICS_PID" 2>/dev/null || true  # wait for the logger to finish terminating
+}
+trap cleanup EXIT  # no `exit 0` here: the script's real exit status must reach the caller (make/CI)

 # ── Suite execution ──────────────────────────────────────

@@ -113,14 +143,14 @@ run_evalplus() {
 run_inspect_eval() {
    local eval_name="$1"
    local display_name="$2"
+    local safe_name="${eval_name//\//_}"  # inspect_evals/ifeval → inspect_evals_ifeval
    log_info "Running Inspect AI: $display_name..."
-    local out="$RUN_DIR/inspect-${eval_name}.json"

    OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
    inspect eval "$eval_name" \
        --model "openai/$MODEL" \
        --log-dir "$RUN_DIR/inspect-logs/" \
-        2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log"
+        2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"

    log_success "Inspect $display_name complete"
 }
@@ -138,7 +168,7 @@ run_bigcodebench() {
 case "$SUITE" in
    quick)
        run_evalplus "humaneval"
-        run_inspect_eval "ifeval" "IFEval (instruction following)"
+        run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
        ;;
    code)
        run_evalplus "humaneval"
@@ -146,13 +176,13 @@ case "$SUITE" in
        run_bigcodebench
        ;;
    tooluse)
-        run_inspect_eval "bfcl" "BFCL (function calling)"
+        run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
        ;;
    full)
        run_evalplus "humaneval"
        run_evalplus "mbpp"
-        run_inspect_eval "ifeval" "IFEval (instruction following)"
-        run_inspect_eval "bfcl" "BFCL (function calling)"
+        run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
+        run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
        run_bigcodebench
        ;;
    *)
--- a/scripts/agentic/setup.sh
+++ b/scripts/agentic/setup.sh
@@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh"
 log_header "Agentic Evaluation Setup"

 # ── Python virtual environment ───────────────────────────
-VENV_DIR="$(data_dir venv)"
+VENV_DIR="$PROJECT_ROOT/.venv"
+REQUIREMENTS="$PROJECT_ROOT/requirements.txt"
+
 if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
-    log_info "Creating Python virtual environment..."
-    python3 -m venv "$VENV_DIR"
+    # Prefer Python 3.13 (bigcodebench requires <3.14)
+    PYTHON_BIN="python3.13"
+    if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
+        PYTHON_BIN="python3"
+        log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
+    fi
+    log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
+    "$PYTHON_BIN" -m venv "$VENV_DIR"
    log_success "Virtual environment created at $VENV_DIR"
 fi

 source "$VENV_DIR/bin/activate"
 log_info "Python: $(python3 --version) from $VENV_DIR"

-# ── Install evaluation frameworks ────────────────────────
-
-# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
-if python3 -c "import inspect_ai" 2>/dev/null; then
-    log_success "inspect-ai already installed"
-else
-    log_info "Installing inspect-ai (main eval framework)..."
-    pip install inspect-ai 2>&1 | tail -3
-    log_success "inspect-ai installed"
-fi
-
-# EvalPlus — HumanEval+ and MBPP+ with native ollama support
-if python3 -c "import evalplus" 2>/dev/null; then
-    log_success "evalplus already installed"
-else
-    log_info "Installing evalplus (code generation benchmarks)..."
-    pip install evalplus 2>&1 | tail -3
-    log_success "evalplus installed"
-fi
-
-# BigCodeBench
-if python3 -c "import bigcodebench" 2>/dev/null; then
-    log_success "bigcodebench already installed"
-else
-    log_info "Installing bigcodebench..."
-    pip install bigcodebench 2>&1 | tail -3
-    log_success "bigcodebench installed"
-fi
+# ── Install from requirements.txt ────────────────────────
+log_info "Installing dependencies from requirements.txt..."
+pip install -r "$REQUIREMENTS" 2>&1 | tail -5
+log_success "Dependencies installed"

 # ── Check for local LLM server ──────────────────────────
 log_header "LLM Server Check"

-ollama_ok=false
-llamacpp_ok=false
-
-if is_cmd ollama; then
-    if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
-        log_success "ollama running at localhost:11434"
-        ollama_ok=true
-        # List available models
-        log_info "Available ollama models:"
-        ollama list 2>/dev/null | head -10 || true
-    else
-        log_warn "ollama installed but not running. Start with: ollama serve"
-    fi
+if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
+    log_success "llama-server running at localhost:8080"
+elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then
+    log_success "ollama running at localhost:11434"
 else
-    log_info "ollama not installed — needed for most agentic benchmarks"
-    log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
-fi
-
-# Check for llama.cpp server
-if curl -s http://localhost:8080/health >/dev/null 2>&1; then
-    log_success "llama.cpp server running at localhost:8080"
-    llamacpp_ok=true
-else
-    log_info "No llama.cpp server detected at localhost:8080"
-    log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
-fi
-
-if ! $ollama_ok && ! $llamacpp_ok; then
-    log_warn "No local LLM server running. Agentic benchmarks need one."
+    log_warn "No local LLM server running. Start one before running evals:"
+    log_info "  make serve ARGS=\"-m MODEL.gguf\"        (llama-server)"
+    log_info "  ollama serve                            (ollama)"
 fi

 # ── Summary ──────────────────────────────────────────────
 log_header "Setup Complete"
 echo ""
 echo "  Installed tools:"
-echo "    inspect-ai     — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
-echo "    evalplus        — HumanEval+ / MBPP+ with native ollama support"
-echo "    bigcodebench    — 1,140 coding tasks across 139 libraries"
+echo "    inspect-ai      — All-in-one eval framework (IFEval, BFCL, GAIA, ...)"
+echo "    inspect-evals   — Task definitions for inspect-ai"
+echo "    evalplus         — HumanEval+ / MBPP+ with native ollama support"
+echo "    bigcodebench     — 1,140 coding tasks across 139 libraries"
 echo ""
-echo "  To activate the virtual environment:"
-echo "    source data/venv/bin/activate"
+echo "  Activate venv:  source .venv/bin/activate"
 echo ""
 echo "  Run evaluations:"
-echo "    make agentic-quick      # EvalPlus + IFEval (~1 hour)"
-echo "    make agentic-full       # BFCL + BigCodeBench (~3-4 hours)"
+echo "    make agentic-quick      # EvalPlus HumanEval+ + IFEval (~1 hour)"
+echo "    make agentic-code       # EvalPlus + BigCodeBench (~2-3 hours)"
+echo "    make agentic-tooluse    # BFCL function calling (~1-2 hours)"
+echo "    make agentic-full       # All of the above (~5-6 hours)"
 echo ""