fix: address code review findings — batch args, venv path, serve flags

- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
Felipe Cardoso
2026-03-31 10:10:48 +02:00
parent dd403a907c
commit 6ab08537ca
10 changed files with 137 additions and 93 deletions

View File

@@ -5,7 +5,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
# Virtualenv lives at the repo root (.venv); the old data_dir location is
# retired — the stale `VENV_DIR="$(data_dir venv)"` assignment was a dead
# store immediately overwritten and has been removed.
VENV_DIR="$PROJECT_ROOT/.venv"
EVAL_DIR="$(data_dir evals)"
# ── Argument parsing ─────────────────────────────────────
@@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do
done
# ── Validation ───────────────────────────────────────────
# NOTE(review): this early exit-on-empty-MODEL appears to be stale diff
# residue — the auto-detection block further down also handles an empty
# MODEL (and can fill it in from the server), but it is unreachable while
# this exits first. Confirm which of the two should remain.
if [[ -z "$MODEL" ]]; then
log_error "Model name required. Use --model NAME"
log_info "Examples:"
log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
exit 1
fi
# Require the project virtualenv before anything else runs in it.
[[ -f "$VENV_DIR/bin/activate" ]] || {
  log_error "Virtual environment not found. Run: make agentic-setup"
  exit 1
}
# shellcheck disable=SC1091 — path is resolved at runtime
source "$VENV_DIR/bin/activate"
# ── Server reachability / endpoint auto-detection ────────
# If the user left ENDPOINT at its default (ollama's OpenAI-compat URL),
# probe for a local llama-server first, then ollama. With an explicit
# endpoint, just verify something answers on /health or /models.
# (The old single-probe check that used to live here was stale diff
# residue and has been removed.)
if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then
  if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then
    ENDPOINT="http://localhost:8080/v1"
    log_info "Auto-detected llama-server at localhost:8080"
  elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then
    log_info "Auto-detected ollama at localhost:11434"
  else
    log_error "No LLM server found. Start one first:"
    log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
    log_info " ollama serve (ollama)"
    exit 1
  fi
else
  # llama-server exposes /health one level above /v1; fall back to the
  # OpenAI-compat /models listing for other servers.
  if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \
     ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then
    log_error "No LLM server at $ENDPOINT"
    exit 1
  fi
fi
# Auto-detect model name from server if not provided:
# ask the OpenAI-compatible /models endpoint and take the first model id.
if [[ -z "$MODEL" ]]; then
  DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    models = data.get('data', [])
    if models:
        print(models[0].get('id', ''))
except Exception:
    # best-effort: any parse failure just yields no output
    pass
" 2>/dev/null || true)
  if [[ -n "$DETECTED_MODEL" ]]; then
    MODEL="$DETECTED_MODEL"
    log_info "Auto-detected model: $MODEL"
  else
    log_error "Model name required. Use --model NAME"
    log_info "Examples:"
    log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)"
    log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)"
    exit 1
  fi
fi
TS="$(timestamp)"
# Sanitize the model name for use in a directory name: every character
# outside [A-Za-z0-9._-] (e.g. ':' in ollama tags, '/' in HF-style names)
# becomes '_', with runs squeezed. printf (not echo) avoids feeding a
# trailing newline to tr, which used to produce a spurious trailing '_'.
# (The stale unsanitized RUN_DIR assignment was dead diff residue.)
SAFE_MODEL="$(printf '%s' "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')"
RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}"
mkdir -p "$RUN_DIR"
log_header "Agentic Evaluation: $SUITE"
@@ -86,7 +112,11 @@ ENDJSON
METRICS_FILE="$RUN_DIR/metrics.csv"
# Sample system metrics in the background for the duration of the run.
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 &
METRICS_PID=$!

# Stop the sampler on any exit path. Do NOT 'exit 0' inside the trap:
# that would clobber the script's real exit status, so a failed suite
# (under set -e) would still look green to callers/CI. A bare 'trap
# cleanup EXIT' preserves the original status.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
trap cleanup EXIT
# ── Suite execution ──────────────────────────────────────
@@ -113,14 +143,14 @@ run_evalplus() {
# Run one Inspect AI eval against the OpenAI-compatible endpoint.
# Arguments: $1 - eval task name (e.g. inspect_evals/ifeval)
#            $2 - human-readable display name for log messages
# Globals:   ENDPOINT, MODEL, RUN_DIR (read)
# Outputs:   eval logs under $RUN_DIR; tees stdout+stderr to a .log file
run_inspect_eval() {
  local eval_name="$1"
  local display_name="$2"
  # inspect_evals/ifeval → inspect_evals_ifeval (safe for filenames)
  local safe_name="${eval_name//\//_}"
  log_info "Running Inspect AI: $display_name..."
  # (Removed: dead 'local out=…json' that used the unsanitized name, and
  # the stale duplicate tee line left over from the previous revision.)
  OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \
    inspect eval "$eval_name" \
      --model "openai/$MODEL" \
      --log-dir "$RUN_DIR/inspect-logs/" \
      2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log"
  log_success "Inspect $display_name complete"
}
@@ -138,7 +168,7 @@ run_bigcodebench() {
case "$SUITE" in
quick)
run_evalplus "humaneval"
run_inspect_eval "ifeval" "IFEval (instruction following)"
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
;;
code)
run_evalplus "humaneval"
@@ -146,13 +176,13 @@ case "$SUITE" in
run_bigcodebench
;;
tooluse)
run_inspect_eval "bfcl" "BFCL (function calling)"
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
;;
full)
run_evalplus "humaneval"
run_evalplus "mbpp"
run_inspect_eval "ifeval" "IFEval (instruction following)"
run_inspect_eval "bfcl" "BFCL (function calling)"
run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)"
run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)"
run_bigcodebench
;;
*)