fix: address code review findings — batch args, venv path, serve flags

- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
This commit is contained in:
Felipe Cardoso
2026-03-31 10:10:48 +02:00
parent dd403a907c
commit 6ab08537ca
10 changed files with 137 additions and 93 deletions

View File

# NOTE(review): this span is rendered diff output, not a runnable script — the
# `@@ -8,91 +8,56 @@` hunk header below and the interleaved pre-/post-change
# lines (their +/- markers did not survive extraction) make that clear.
# Comments below flag the visible old/new pairs; do not execute this text
# as-is — it contains duplicate assignments and an unbalanced `if`.
@@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh"
log_header "Agentic Evaluation Setup"
# ── Python virtual environment ───────────────────────────
# NOTE(review): two consecutive VENV_DIR assignments — the first
# ($(data_dir venv)) is presumably the deleted line and the second
# ($PROJECT_ROOT/.venv) its replacement, matching the commit message's
# "data/venv → .venv" change. TODO confirm against the repo's actual diff.
VENV_DIR="$(data_dir venv)"
VENV_DIR="$PROJECT_ROOT/.venv"
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
# NOTE(review): old venv-creation body (plain python3) — deleted lines.
log_info "Creating Python virtual environment..."
python3 -m venv "$VENV_DIR"
# NOTE(review): new venv-creation body — prefers python3.13 because
# bigcodebench declares a <3.14 constraint per the comment below.
# Prefer Python 3.13 (bigcodebench requires <3.14)
PYTHON_BIN="python3.13"
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
PYTHON_BIN="python3"
log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
fi
log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
"$PYTHON_BIN" -m venv "$VENV_DIR"
log_success "Virtual environment created at $VENV_DIR"
fi
source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"
# ── Install evaluation frameworks ────────────────────────
# NOTE(review): the three per-package install blocks below (inspect-ai,
# evalplus, bigcodebench) appear to be the DELETED side of the diff; the
# single requirements.txt install that follows them is the replacement,
# matching "Simplify agentic setup to use requirements.txt" in the
# commit message.
# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
if python3 -c "import inspect_ai" 2>/dev/null; then
log_success "inspect-ai already installed"
else
log_info "Installing inspect-ai (main eval framework)..."
pip install inspect-ai 2>&1 | tail -3
log_success "inspect-ai installed"
fi
# EvalPlus — HumanEval+ and MBPP+ with native ollama support
if python3 -c "import evalplus" 2>/dev/null; then
log_success "evalplus already installed"
else
log_info "Installing evalplus (code generation benchmarks)..."
pip install evalplus 2>&1 | tail -3
log_success "evalplus installed"
fi
# BigCodeBench
if python3 -c "import bigcodebench" 2>/dev/null; then
log_success "bigcodebench already installed"
else
log_info "Installing bigcodebench..."
pip install bigcodebench 2>&1 | tail -3
log_success "bigcodebench installed"
fi
# ── Install from requirements.txt ────────────────────────
log_info "Installing dependencies from requirements.txt..."
pip install -r "$REQUIREMENTS" 2>&1 | tail -5
log_success "Dependencies installed"
# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"
# NOTE(review): ollama_ok/llamacpp_ok flags and the `if is_cmd ollama`
# wrapper below belong to the deleted version; its matching `fi` was
# evidently on a removed line, so the `if` is unbalanced in this residue.
ollama_ok=false
llamacpp_ok=false
if is_cmd ollama; then
if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
log_success "ollama running at localhost:11434"
ollama_ok=true
# List available models
log_info "Available ollama models:"
ollama list 2>/dev/null | head -10 || true
else
log_warn "ollama installed but not running. Start with: ollama serve"
fi
# NOTE(review): new, simplified server check — probes llama-server first,
# then falls back to ollama's tags endpoint.
if curl -sf http://localhost:8080/health >/dev/null 2>&1; then
log_success "llama-server running at localhost:8080"
elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then
log_success "ollama running at localhost:11434"
else
log_info "ollama not installed — needed for most agentic benchmarks"
log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
fi
# Check for llama.cpp server
# NOTE(review): deleted llama.cpp probe — duplicates the :8080 health
# check above; presumably removed in favor of the combined check.
if curl -s http://localhost:8080/health >/dev/null 2>&1; then
log_success "llama.cpp server running at localhost:8080"
llamacpp_ok=true
else
log_info "No llama.cpp server detected at localhost:8080"
log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi
# NOTE(review): old one-line warning vs. new warning + hints — another
# interleaved old/new pair.
if ! $ollama_ok && ! $llamacpp_ok; then
log_warn "No local LLM server running. Agentic benchmarks need one."
log_warn "No local LLM server running. Start one before running evals:"
log_info "  make serve ARGS=\"-m MODEL.gguf\"   (llama-server)"
log_info "  ollama serve                        (ollama)"
fi
# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"
echo ""
echo "  Installed tools:"
# NOTE(review): old three-line tool list followed immediately by the new
# four-line list (adds inspect-evals) — old/new diff pair.
echo "    inspect-ai   — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)"
echo "    evalplus     — HumanEval+ / MBPP+ with native ollama support"
echo "    bigcodebench — 1,140 coding tasks across 139 libraries"
echo "    inspect-ai    — All-in-one eval framework (IFEval, BFCL, GAIA, ...)"
echo "    inspect-evals — Task definitions for inspect-ai"
echo "    evalplus      — HumanEval+ / MBPP+ with native ollama support"
echo "    bigcodebench  — 1,140 coding tasks across 139 libraries"
echo ""
# NOTE(review): old two-line activation hint vs. new one-liner pointing at
# .venv — consistent with the venv-path change at the top of this hunk.
echo "  To activate the virtual environment:"
echo "    source data/venv/bin/activate"
echo "  Activate venv: source .venv/bin/activate"
echo ""
echo "  Run evaluations:"
# NOTE(review): old two-target make list vs. new four-target list
# (adds agentic-code and agentic-tooluse) — old/new diff pair.
echo "    make agentic-quick   # EvalPlus + IFEval (~1 hour)"
echo "    make agentic-full    # BFCL + BigCodeBench (~3-4 hours)"
echo "    make agentic-quick    # EvalPlus HumanEval+ + IFEval (~1 hour)"
echo "    make agentic-code     # EvalPlus + BigCodeBench (~2-3 hours)"
echo "    make agentic-tooluse  # BFCL function calling (~1-2 hours)"
echo "    make agentic-full     # All of the above (~5-6 hours)"
echo ""