Models: - configs/models.conf: catalog with Qwen3.5-35B-A3B (MoE, top pick), Qwen3.5-27B (dense), Qwen3-Coder-30B-A3B (agentic/coding) - Updated benchmark setup to show catalog with download status - docs/model-recommendations.md: memory planning, quantization guide Agentic evaluation: - scripts/agentic/setup.sh: installs inspect-ai, evalplus, bigcodebench in a Python venv - scripts/agentic/run-eval.sh: runs evaluations against local LLM server (ollama or llama.cpp). Suites: quick (HumanEval+IFEval), code (EvalPlus+BigCodeBench), tooluse (BFCL), full (all) - bin/agentic: dispatcher with help - docs/agentic-benchmarks.md: methodology, framework comparison, model recommendations for agentic use Updated: Makefile (6 new targets), README, CLAUDE.md, docs/references.md Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
99 lines
3.5 KiB
Bash
#!/usr/bin/env bash
#
# Setup agentic evaluation tools: creates a Python virtual environment and
# installs the eval frameworks used by scripts/agentic/run-eval.sh.
#
# Requires: python3, lib/common.sh (log_*, is_cmd, data_dir helpers).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

log_header "Agentic Evaluation Setup"

# ── Python virtual environment ───────────────────────────

# Fail early with a clear message instead of a bare "python3: command not
# found" halfway through setup.
is_cmd python3 || { log_warn "python3 not found — install Python 3 first"; exit 1; }

VENV_DIR="$(data_dir venv)"
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
  log_info "Creating Python virtual environment..."
  python3 -m venv "$VENV_DIR"
  log_success "Virtual environment created at $VENV_DIR"
fi

# shellcheck disable=SC1091 — activate script is generated at runtime
source "$VENV_DIR/bin/activate"
log_info "Python: $(python3 --version) from $VENV_DIR"
|
|
|
|
# ── Install evaluation frameworks ────────────────────────

#######################################
# Install a pip package into the active venv unless its module already
# imports. Shows only the tail of pip's output to keep the log readable.
# Globals:   none (relies on the venv activated above)
# Arguments: $1 - importable module name (import-check probe)
#            $2 - pip package name
#            $3 - optional short description for the install message
# Outputs:   log lines; last 3 lines of pip output on install
#######################################
ensure_pip_pkg() {
  local module=$1 pkg=$2 desc=${3:-}
  if python3 -c "import ${module}" 2>/dev/null; then
    log_success "$pkg already installed"
  else
    log_info "Installing ${pkg}${desc:+ ($desc)}..."
    # python3 -m pip guarantees the venv's pip is used, not a stray
    # `pip` earlier on PATH.
    python3 -m pip install "$pkg" 2>&1 | tail -3
    log_success "$pkg installed"
  fi
}

# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.)
ensure_pip_pkg inspect_ai inspect-ai "main eval framework"

# EvalPlus — HumanEval+ and MBPP+ with native ollama support
ensure_pip_pkg evalplus evalplus "code generation benchmarks"

# BigCodeBench
ensure_pip_pkg bigcodebench bigcodebench
|
|
|
|
# ── Check for local LLM server ──────────────────────────
log_header "LLM Server Check"

ollama_ok=false
llamacpp_ok=false

if is_cmd ollama; then
  # --max-time keeps the probe from hanging forever on a wedged server.
  if curl -s --max-time 5 http://localhost:11434/api/tags >/dev/null 2>&1; then
    log_success "ollama running at localhost:11434"
    ollama_ok=true
    # List available models
    log_info "Available ollama models:"
    ollama list 2>/dev/null | head -10 || true
  else
    log_warn "ollama installed but not running. Start with: ollama serve"
  fi
else
  log_info "ollama not installed — needed for most agentic benchmarks"
  log_info "Install: curl -fsSL https://ollama.com/install.sh | sh"
fi

# Check for llama.cpp server
if curl -s --max-time 5 http://localhost:8080/health >/dev/null 2>&1; then
  log_success "llama.cpp server running at localhost:8080"
  llamacpp_ok=true
else
  log_info "No llama.cpp server detected at localhost:8080"
  log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap"
fi

if ! $ollama_ok && ! $llamacpp_ok; then
  log_warn "No local LLM server running. Agentic benchmarks need one."
fi
|
|
|
|
# ── Summary ──────────────────────────────────────────────
log_header "Setup Complete"

# One heredoc instead of 18 echo calls; the activation hint uses the real
# $VENV_DIR rather than a hard-coded "data/venv", which can drift from
# whatever data_dir() resolves to.
cat <<EOF

  Installed tools:
    inspect-ai   — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)
    evalplus     — HumanEval+ / MBPP+ with native ollama support
    bigcodebench — 1,140 coding tasks across 139 libraries

  To activate the virtual environment:
    source $VENV_DIR/bin/activate

  Run evaluations:
    make agentic-quick   # EvalPlus + IFEval (~1 hour)
    make agentic-full    # BFCL + BigCodeBench (~3-4 hours)

EOF