diff --git a/.gitignore b/.gitignore index 3e316a3..4c96559 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ data/ +.venv/ *.log *.csv *.tmp diff --git a/CLAUDE.md b/CLAUDE.md index bb671cf..2f6cfc0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,9 +41,21 @@ make verify # 9-point optimization checklist bin/audit --json | python3 -m json.tool # Verify JSON output is valid ``` +## Serving + +`scripts/serve/launch.sh` with dispatcher at `bin/serve`. Launches llama-server inside toolbox containers with optimized defaults: Vulkan RADV, q4_0 KV cache, flash attention, no-mmap, full GPU offload. Key flags: +- `--ngram` — n-gram speculative decoding (~1.1-1.4x tg for repetitive content) +- `--no-think` — disables thinking/reasoning via `--reasoning-budget 0` (faster for evals) +- `--ctx N` — context size (default 131072) +- `--parallel N` — concurrent request slots + +## System Tuning + +`scripts/optimize/power-profile.sh` applies Phase 2 optimizations: RyzenAdj PPT increase (85W target, HP caps at 70W sustained), sysctl tuning (vm.swappiness=1, vm.max_map_count=500000), THP=always, RADV_PERFTEST=nogttspill. Systemd services for boot/resume persistence at `configs/ryzenadj-llm.service` and `configs/ryzenadj-resume.service`. + ## Agentic Evaluation -Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `data/venv/`. Eval frameworks: inspect-ai (all-in-one), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint (ollama or llama.cpp server). Model catalog at `configs/models.conf`. +Scripts in `scripts/agentic/` with dispatcher at `bin/agentic`. Uses a Python venv at `.venv/` (Python 3.13, dependencies in `requirements.txt`). Eval frameworks: inspect-ai (all-in-one), inspect-evals (task definitions), evalplus (HumanEval+/MBPP+), bigcodebench. All target an OpenAI-compatible endpoint — auto-detects llama-server (port 8080) or ollama (port 11434). Model catalog at `configs/models.conf`. 
## External Resources diff --git a/bin/benchmark b/bin/benchmark index 6df7533..ba36963 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -23,6 +23,7 @@ case "${1:-help}" in echo " --category LIST Comma-separated: smoke,dense,moe" echo " --skip-longctx Skip long-context (32K) tests" echo " --reps N Standard test repetitions (default: 5)" + echo " -b, --batch N Batch size (default: 2048, try 256 for MoE)" echo " --kv-types LIST KV cache sweep (e.g. f16,q8_0,q4_0 or q4_0:q8_0)" echo "" echo "Examples:" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b019321 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +# Agentic evaluation frameworks +# Install: python3.13 -m venv .venv && source .venv/bin/activate && pip install -r requirements.txt +# Requires Python >=3.10, <3.14 (bigcodebench constraint) + +inspect-ai>=0.3.201 +inspect-evals>=0.6.0 +evalplus>=0.3.1 +bigcodebench>=0.2.5 +openai>=2.26.0 + +# IFEval dependency (not on PyPI) +instruction_following_eval @ git+https://github.com/josejg/instruction_following_eval diff --git a/scripts/agentic/run-eval.sh b/scripts/agentic/run-eval.sh index 27803c8..d0ddcf3 100644 --- a/scripts/agentic/run-eval.sh +++ b/scripts/agentic/run-eval.sh @@ -5,7 +5,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/../../lib/common.sh" -VENV_DIR="$(data_dir venv)" +VENV_DIR="$PROJECT_ROOT/.venv" EVAL_DIR="$(data_dir evals)" # ── Argument parsing ───────────────────────────────────── @@ -37,33 +37,59 @@ while [[ $# -gt 0 ]]; do done # ── Validation ─────────────────────────────────────────── -if [[ -z "$MODEL" ]]; then - log_error "Model name required. Use --model NAME" - log_info "Examples:" - log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)" - log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)" - exit 1 -fi - if [[ ! -f "$VENV_DIR/bin/activate" ]]; then log_error "Virtual environment not found. 
Run: make agentic-setup" exit 1 fi source "$VENV_DIR/bin/activate" -# Check server is reachable -if ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then - # Try ollama native endpoint - if curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then - log_info "Ollama detected, using OpenAI-compat endpoint" +# Auto-detect server if no explicit endpoint given +if [[ "$ENDPOINT" == "http://localhost:11434/v1" ]]; then + if curl -sf "http://localhost:8080/health" >/dev/null 2>&1; then + ENDPOINT="http://localhost:8080/v1" + log_info "Auto-detected llama-server at localhost:8080" + elif curl -sf "http://localhost:11434/api/tags" >/dev/null 2>&1; then + log_info "Auto-detected ollama at localhost:11434" else - log_error "No LLM server at $ENDPOINT. Start ollama or llama.cpp server first." + log_error "No LLM server found. Start one first:" + log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)" + log_info " ollama serve (ollama)" + exit 1 + fi +else + if ! curl -sf "${ENDPOINT%/v1}/health" >/dev/null 2>&1 && \ + ! curl -sf "$ENDPOINT/models" >/dev/null 2>&1; then + log_error "No LLM server at $ENDPOINT" + exit 1 + fi +fi + +# Auto-detect model name from server if not provided +if [[ -z "$MODEL" ]]; then + DETECTED_MODEL=$(curl -sf "$ENDPOINT/models" 2>/dev/null | python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + models = data.get('data', []) + if models: + print(models[0].get('id', '')) +except Exception: pass +" 2>/dev/null || true) + if [[ -n "$DETECTED_MODEL" ]]; then + MODEL="$DETECTED_MODEL" + log_info "Auto-detected model: $MODEL" + else + log_error "Model name required. 
Use --model NAME" log_info "Examples:" log_info " --model qwen3.5:35b-a3b-q8_0 (ollama)" log_info " --model Qwen3.5-35B-A3B-Q8_0 (llama.cpp server)" exit 1 fi fi TS="$(timestamp)" -RUN_DIR="$EVAL_DIR/${SUITE}-${MODEL//[:\/]/_}-${TS}" +SAFE_MODEL="$(echo "$MODEL" | tr -cs 'a-zA-Z0-9._-' '_')" +RUN_DIR="$EVAL_DIR/${SUITE}-${SAFE_MODEL}-${TS}" mkdir -p "$RUN_DIR" log_header "Agentic Evaluation: $SUITE" @@ -86,7 +112,11 @@ ENDJSON METRICS_FILE="$RUN_DIR/metrics.csv" bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 5 & METRICS_PID=$! -trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT +cleanup() { + kill "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +} +trap cleanup EXIT # ── Suite execution ────────────────────────────────────── @@ -113,14 +143,14 @@ run_evalplus() { run_inspect_eval() { local eval_name="$1" local display_name="$2" + local safe_name="${eval_name//\//_}" # inspect_evals/ifeval → inspect_evals_ifeval log_info "Running Inspect AI: $display_name..." 
- local out="$RUN_DIR/inspect-${eval_name}.json" OPENAI_BASE_URL="$ENDPOINT" OPENAI_API_KEY="not-needed" \ inspect eval "$eval_name" \ --model "openai/$MODEL" \ --log-dir "$RUN_DIR/inspect-logs/" \ - 2>&1 | tee "$RUN_DIR/inspect-${eval_name}.log" + 2>&1 | tee "$RUN_DIR/inspect-${safe_name}.log" log_success "Inspect $display_name complete" } @@ -138,7 +168,7 @@ run_bigcodebench() { case "$SUITE" in quick) run_evalplus "humaneval" - run_inspect_eval "ifeval" "IFEval (instruction following)" + run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)" ;; code) run_evalplus "humaneval" @@ -146,13 +176,13 @@ case "$SUITE" in run_bigcodebench ;; tooluse) - run_inspect_eval "bfcl" "BFCL (function calling)" + run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)" ;; full) run_evalplus "humaneval" run_evalplus "mbpp" - run_inspect_eval "ifeval" "IFEval (instruction following)" - run_inspect_eval "bfcl" "BFCL (function calling)" + run_inspect_eval "inspect_evals/ifeval" "IFEval (instruction following)" + run_inspect_eval "inspect_evals/bfcl" "BFCL (function calling)" run_bigcodebench ;; *) diff --git a/scripts/agentic/setup.sh b/scripts/agentic/setup.sh index 8839129..4b47516 100644 --- a/scripts/agentic/setup.sh +++ b/scripts/agentic/setup.sh @@ -8,91 +8,56 @@ source "$SCRIPT_DIR/../../lib/common.sh" log_header "Agentic Evaluation Setup" # ── Python virtual environment ─────────────────────────── -VENV_DIR="$(data_dir venv)" +VENV_DIR="$PROJECT_ROOT/.venv" +REQUIREMENTS="$PROJECT_ROOT/requirements.txt" + if [[ ! -f "$VENV_DIR/bin/activate" ]]; then - log_info "Creating Python virtual environment..." - python3 -m venv "$VENV_DIR" + # Prefer Python 3.13 (bigcodebench requires <3.14) + PYTHON_BIN="python3.13" + if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then + PYTHON_BIN="python3" + log_warn "python3.13 not found, using $(python3 --version). bigcodebench may not install." + fi + log_info "Creating virtual environment with $($PYTHON_BIN --version)..." 
+ "$PYTHON_BIN" -m venv "$VENV_DIR" log_success "Virtual environment created at $VENV_DIR" fi source "$VENV_DIR/bin/activate" log_info "Python: $(python3 --version) from $VENV_DIR" -# ── Install evaluation frameworks ──────────────────────── - -# Inspect AI — the all-in-one eval framework (bundles BFCL, GAIA, HumanEval, IFEval, etc.) -if python3 -c "import inspect_ai" 2>/dev/null; then - log_success "inspect-ai already installed" -else - log_info "Installing inspect-ai (main eval framework)..." - pip install inspect-ai 2>&1 | tail -3 - log_success "inspect-ai installed" -fi - -# EvalPlus — HumanEval+ and MBPP+ with native ollama support -if python3 -c "import evalplus" 2>/dev/null; then - log_success "evalplus already installed" -else - log_info "Installing evalplus (code generation benchmarks)..." - pip install evalplus 2>&1 | tail -3 - log_success "evalplus installed" -fi - -# BigCodeBench -if python3 -c "import bigcodebench" 2>/dev/null; then - log_success "bigcodebench already installed" -else - log_info "Installing bigcodebench..." - pip install bigcodebench 2>&1 | tail -3 - log_success "bigcodebench installed" -fi +# ── Install from requirements.txt ──────────────────────── +log_info "Installing dependencies from requirements.txt..." +pip install -r "$REQUIREMENTS" 2>&1 | tail -5 +log_success "Dependencies installed" # ── Check for local LLM server ────────────────────────── log_header "LLM Server Check" -ollama_ok=false -llamacpp_ok=false - -if is_cmd ollama; then - if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then - log_success "ollama running at localhost:11434" - ollama_ok=true - # List available models - log_info "Available ollama models:" - ollama list 2>/dev/null | head -10 || true - else - log_warn "ollama installed but not running. 
Start with: ollama serve" - fi +if curl -sf http://localhost:8080/health >/dev/null 2>&1; then + log_success "llama-server running at localhost:8080" +elif curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then + log_success "ollama running at localhost:11434" else - log_info "ollama not installed — needed for most agentic benchmarks" - log_info "Install: curl -fsSL https://ollama.com/install.sh | sh" -fi - -# Check for llama.cpp server -if curl -s http://localhost:8080/health >/dev/null 2>&1; then - log_success "llama.cpp server running at localhost:8080" - llamacpp_ok=true -else - log_info "No llama.cpp server detected at localhost:8080" - log_info "Start with: toolbox run -c llama-vulkan-radv -- llama-server -m MODEL -c 8192 -ngl 99 -fa 1 --no-mmap" -fi - -if ! $ollama_ok && ! $llamacpp_ok; then - log_warn "No local LLM server running. Agentic benchmarks need one." + log_warn "No local LLM server running. Start one before running evals:" + log_info " make serve ARGS=\"-m MODEL.gguf\" (llama-server)" + log_info " ollama serve (ollama)" fi # ── Summary ────────────────────────────────────────────── log_header "Setup Complete" echo "" echo " Installed tools:" -echo " inspect-ai — All-in-one eval framework (HumanEval, BFCL, IFEval, GAIA, ...)" -echo " evalplus — HumanEval+ / MBPP+ with native ollama support" -echo " bigcodebench — 1,140 coding tasks across 139 libraries" +echo " inspect-ai — All-in-one eval framework (IFEval, BFCL, GAIA, ...)" +echo " inspect-evals — Task definitions for inspect-ai" +echo " evalplus — HumanEval+ / MBPP+ with native ollama support" +echo " bigcodebench — 1,140 coding tasks across 139 libraries" echo "" -echo " To activate the virtual environment:" -echo " source data/venv/bin/activate" +echo " Activate venv: source .venv/bin/activate" echo "" echo " Run evaluations:" -echo " make agentic-quick # EvalPlus + IFEval (~1 hour)" -echo " make agentic-full # BFCL + BigCodeBench (~3-4 hours)" +echo " make agentic-quick # EvalPlus 
HumanEval+ + IFEval (~1 hour)" +echo " make agentic-code # EvalPlus + BigCodeBench (~2-3 hours)" +echo " make agentic-tooluse # BFCL function calling (~1-2 hours)" +echo " make agentic-full # All of the above (~5-6 hours)" echo "" diff --git a/scripts/benchmark/run-baseline.sh b/scripts/benchmark/run-baseline.sh index 99567d0..ebe7c6b 100644 --- a/scripts/benchmark/run-baseline.sh +++ b/scripts/benchmark/run-baseline.sh @@ -270,7 +270,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" - -r "$REPS_LONGCTX" "${KV_ARGS[@]}") + -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}") printf " cmd: %s\n" "${CMD_LC[*]}" if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then diff --git a/scripts/benchmark/run-suite.sh b/scripts/benchmark/run-suite.sh index bd3f4f0..d5ff65f 100644 --- a/scripts/benchmark/run-suite.sh +++ b/scripts/benchmark/run-suite.sh @@ -252,7 +252,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do UB_SIZE=2048; [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512 CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" -ngl 99 -mmp 0 -m "$TOOLBOX_MODEL_PATH" -fa 1 - -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${KV_ARGS[@]}") + -p "$CTX_PROMPT" -n 32 -d "$CTX_DEPTH" -ub "$UB_SIZE" -r "$REPS_LONGCTX" "${BATCH_ARGS[@]}" "${KV_ARGS[@]}") if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then log_success "Done"; tail -3 "$OUT_LC" else diff --git a/scripts/serve/launch.sh b/scripts/serve/launch.sh index b3513ff..7645cb5 100755 --- a/scripts/serve/launch.sh +++ b/scripts/serve/launch.sh @@ -14,6 +14,7 @@ CTX_SIZE=131072 PARALLEL=1 MODEL="" NGRAM=false +NO_THINK=false while [[ $# -gt 0 ]]; do case "$1" in @@ -23,6 +24,7 @@ while [[ $# -gt 0 ]]; do --ctx) CTX_SIZE="$2"; shift 2 ;; --parallel) PARALLEL="$2"; shift 2 ;; --ngram) NGRAM=true; shift ;; + --no-think) NO_THINK=true; shift ;; --help|-h) echo 
"Usage: launch.sh [OPTIONS]" echo "" @@ -33,6 +35,7 @@ while [[ $# -gt 0 ]]; do echo " --ctx N Context size (default: 131072)" echo " --parallel N Parallel request slots (default: 1)" echo " --ngram Enable n-gram speculative decoding (~1.1-1.4x tg)" + echo " --no-think Disable thinking/reasoning (faster for evals)" echo "" echo "Presets (pass model filename):" echo " Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf General purpose daily driver" @@ -101,7 +104,7 @@ fi SERVER_ARGS=( -ngl 99 # Full GPU offload --no-mmap # Direct load, no mmap overhead - -fa # Flash attention + -fa on # Flash attention -m "$TOOLBOX_MODEL_PATH" -c "$CTX_SIZE" # Context size --cache-type-k q4_0 # KV cache quantization (fastest on Vulkan) @@ -110,6 +113,11 @@ SERVER_ARGS=( -np "$PARALLEL" # Parallel slots ) +# Disable thinking mode (faster for evals) +if $NO_THINK; then + SERVER_ARGS+=(--reasoning-budget 0) +fi + # N-gram speculative decoding if $NGRAM; then SERVER_ARGS+=( @@ -126,6 +134,7 @@ log_info "Backend: $BACKEND" log_info "Context: $CTX_SIZE tokens" log_info "KV cache: q4_0/q4_0" log_info "Parallel slots: $PARALLEL" +$NO_THINK && log_info "Thinking mode: DISABLED (--reasoning-budget 0)" $NGRAM && log_info "N-gram speculative: enabled (draft-max=64)" log_info "Port: $PORT" log_info "Endpoint: http://localhost:$PORT" diff --git a/tests/benchmark_flags.bats b/tests/benchmark_flags.bats index 9bc0f2d..d2ed888 100644 --- a/tests/benchmark_flags.bats +++ b/tests/benchmark_flags.bats @@ -11,6 +11,7 @@ load test_helper.sh assert_output --partial "--category" assert_output --partial "--skip-longctx" assert_output --partial "--kv-types" + assert_output --partial "--batch" } @test "run-suite --help shows usage and exits 0" { @@ -22,6 +23,7 @@ load test_helper.sh assert_output --partial "--skip-longctx" assert_output --partial "--tag" assert_output --partial "--kv-types" + assert_output --partial "--batch" } @test "benchmark dispatcher shows help with no args" { @@ -31,6 +33,18 @@ load test_helper.sh 
assert_output --partial "--max-size" assert_output --partial "--skip-longctx" assert_output --partial "--kv-types" + assert_output --partial "--batch" +} + +@test "serve --help shows usage and exits 0" { + run bash "$PROJECT_ROOT/bin/serve" --help + assert_success + assert_output --partial "Usage" + assert_output --partial "--model" + assert_output --partial "--ngram" + assert_output --partial "--no-think" + assert_output --partial "--ctx" + assert_output --partial "--port" } @test "benchmark dispatcher passes --help through to baseline" {