- Fix missing BATCH_ARGS in long-context commands (both benchmark scripts)
- Fix CLAUDE.md stale venv path (data/venv → .venv) and add serve/power docs
- Add -b/--batch to bin/benchmark help text
- Add --no-think flag to serve script (--reasoning-budget 0)
- Sanitize model names in eval run directories
- Simplify agentic setup to use requirements.txt
- Add serve --help test, batch flag assertions to existing tests
- Add requirements.txt for reproducible venv setup (Python 3.13)
64 lines
2.7 KiB
Bash
#!/usr/bin/env bash
#
# Bootstrap the agentic evaluation toolchain: Python venv, eval
# frameworks, and a sanity check for a local LLM server.
set -euo pipefail

# Resolve this script's own directory so the shared helpers can be
# sourced no matter which directory the caller runs from.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=/dev/null
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"
|
|
|
|
# ── Python virtual environment ───────────────────────────
VENV_DIR="$PROJECT_ROOT/.venv"
REQUIREMENTS="$PROJECT_ROOT/requirements.txt"

if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  # Prefer Python 3.13 (bigcodebench requires <3.14)
  PYTHON_BIN="python3.13"
  if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
    # Fail fast if even the fallback interpreter is missing, rather than
    # letting '"$PYTHON_BIN" -m venv' die later with "command not found".
    if ! command -v python3 >/dev/null 2>&1; then
      log_warn "Neither python3.13 nor python3 found on PATH; cannot create virtual environment."
      exit 1
    fi
    PYTHON_BIN="python3"
    log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install."
  fi
  log_info "Creating virtual environment with $($PYTHON_BIN --version)..."
  "$PYTHON_BIN" -m venv "$VENV_DIR"
  log_success "Virtual environment created at $VENV_DIR"
fi

# shellcheck source=/dev/null
source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"
|
|
|
|
# ── Install from requirements.txt ────────────────────────
# Guard against a missing requirements file with a clear message.
if [[ ! -f "$REQUIREMENTS" ]]; then
  log_warn "requirements.txt not found at $REQUIREMENTS"
  exit 1
fi

log_info "Installing dependencies from requirements.txt..."
# Log to a temp file so a successful install stays quiet (last 5 lines
# only) but a FAILED install shows the full pip output — the old
# 'pip … | tail -5' truncated the error context on failure.
pip_log="$(mktemp)"
if pip install -r "$REQUIREMENTS" >"$pip_log" 2>&1; then
  tail -n 5 -- "$pip_log"
  rm -f -- "$pip_log"
  log_success "Dependencies installed"
else
  cat -- "$pip_log"
  rm -f -- "$pip_log"
  log_warn "pip install failed; see full log above."
  exit 1
fi
|
|
|
|
# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"

# Returns 0 when the given health-check URL answers with HTTP success.
server_up() {
  curl -sf "$1" >/dev/null 2>&1
}

# Either backend is sufficient for the evals; probe in preference order.
if server_up "http://localhost:8080/health"; then
  log_success "llama-server running at localhost:8080"
elif server_up "http://localhost:11434/api/tags"; then
  log_success "ollama running at localhost:11434"
else
  log_warn "No local LLM server running. Start one before running evals:"
  log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)"
  log_info " ollama serve (ollama)"
fi
|
|
|
|
# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"

# Quoted here-doc: no expansion happens, and the output is byte-for-byte
# what the individual echo statements produced.
cat <<'EOF'

 Installed tools:
 inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)
 inspect-evals — Task definitions for inspect-ai
 evalplus — HumanEval+ / MBPP+ with native ollama support
 bigcodebench — 1,140 coding tasks across 139 libraries

 Activate venv: source .venv/bin/activate

 Run evaluations:
 make agentic-quick # EvalPlus HumanEval+ + IFEval (~1 hour)
 make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)
 make agentic-tooluse # BFCL function calling (~1-2 hours)
 make agentic-full # All of the above (~5-6 hours)

EOF