Initial commit

This commit is contained in:
Felipe Cardoso
2026-03-25 20:13:15 +01:00
commit c596e38e9e
26 changed files with 2345 additions and 0 deletions

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env bash
# Quick-glance system audit — single screen status overview
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
# ── Gather data ──────────────────────────────────────────
# All values are collected up front so the output sections below only format.
# The detect_* / recommended_* helpers are not defined in this file; they are
# presumably provided by the lib/detect.sh sourced above (confirm there).
# NOTE: `set -euo pipefail` is active, so a helper that exits non-zero inside
# one of these assignments terminates the script before the report is printed.
cpu_model="$(detect_cpu_model)"
# Shown as the "T" (thread) figure in the Hardware section.
cpu_threads="$(detect_cpu_cores)"
# Shown as the "C" (core) figure in the Hardware section.
cpu_physical="$(detect_cpu_physical)"
gpu_name="$(detect_gpu_name)"
kernel="$(detect_kernel_version)"
firmware="$(detect_firmware_version)"
# The four memory values are compared against byte constants and divided by
# 1073741824 further down, so they are expected to be byte counts.
vram_total="$(detect_vram_total)"
vram_used="$(detect_vram_used)"
gtt_total="$(detect_gtt_total)"
gtt_used="$(detect_gtt_used)"
ram_kb="$(detect_system_ram_kb)"
# Converted to bytes for human_bytes; arithmetic fails if ram_kb is not an integer.
ram_bytes=$(( ram_kb * 1024 ))
# Current kernel command-line settings (empty string is treated as "missing" below).
param_iommu="$(detect_kernel_param 'iommu')"
param_gttsize="$(detect_gttsize_param)"
param_pages="$(detect_pages_limit_param)"
tuned="$(detect_tuned_profile)"
rocm_ver="$(detect_rocm_version)"
vulkan_drv="$(detect_vulkan_driver)"
vulkan_ver="$(detect_vulkan_version)"
# Recommended values; rec_gttsize is multiplied by 1048576 below, i.e. used as MiB.
rec_gttsize="$(recommended_gttsize_mib)"
rec_pages="$(recommended_pages_limit)"
# ── Score tracking ───────────────────────────────────────
# Running tally consumed by the "Optimization Score" section at the end.
total=0
score=0

# check PASS LABEL DETAIL
# Records one scored check and prints it through print_status.
# PASS counts as a pass only when it is the literal string "1".
check() {
  local pass="$1" label="$2" detail="$3"
  local verdict="fail"

  total=$(( total + 1 ))
  if [[ "$pass" == "1" ]]; then
    score=$(( score + 1 ))
    verdict="pass"
  fi
  print_status "$verdict" "$label" "$detail"
}

# check_warn LABEL DETAIL
# Prints an unscored warning line.
check_warn() {
  local label="$1"
  local detail="$2"
  print_status warn "$label" "$detail"
}

# check_info LABEL DETAIL
# Prints an unscored informational line.
check_info() {
  local label="$1"
  local detail="$2"
  print_status info "$label" "$detail"
}
# ── Output ───────────────────────────────────────────────
# Title box, printed between the colour-on and colour-reset sequences.
banner_lines=(
  '╔═══════════════════════════════════════════╗'
  '║ AMD Strix Halo — System Status ║'
  '╚═══════════════════════════════════════════╝'
)
printf "\n${BOLD}${CYAN}"
printf '%s\n' "${banner_lines[@]}"
printf "${RESET}"

# Hardware
log_header "Hardware"
cpu_summary="$cpu_model (${cpu_physical}C/${cpu_threads}T)"
print_kv "CPU" "$cpu_summary"
print_kv "GPU" "$gpu_name"
print_kv "System RAM (visible)" "$(human_bytes "$ram_bytes")"
# Kernel & Firmware
log_header "Kernel & Firmware"

# kernel_version_at_least VERSION MAJOR MINOR PATCH
# Succeeds when the leading "X.Y[.Z]" of VERSION is >= MAJOR.MINOR.PATCH.
# Anything after the numeric prefix ("-200.fc43.x86_64", "-rc1", ...) is
# ignored and a missing patch component counts as 0.
# Returns 1 for a lower version or for input that does not start with "X.Y".
kernel_version_at_least() {
  local version="$1" want_major="$2" want_minor="$3" want_patch="$4"
  local major minor patch

  [[ "$version" =~ ^([0-9]+)\.([0-9]+)(\.([0-9]+))? ]] || return 1
  # 10# forces base 10 so a component such as "08" is not parsed as octal.
  major=$(( 10#${BASH_REMATCH[1]} ))
  minor=$(( 10#${BASH_REMATCH[2]} ))
  patch=$(( 10#${BASH_REMATCH[4]:-0} ))

  if (( major > want_major )); then return 0; fi
  if (( major < want_major )); then return 1; fi
  if (( minor > want_minor )); then return 0; fi
  if (( minor < want_minor )); then return 1; fi
  if (( patch >= want_patch )); then return 0; fi
  return 1
}

# The requirement printed to the user is 6.18.4, so the patch level has to be
# part of the comparison (6.18.0 - 6.18.3 must not pass).
kernel_ok=0
if kernel_version_at_least "$kernel" 6 18 4; then
  kernel_ok=1
fi
check "$kernel_ok" "Kernel version" "$kernel (need >= 6.18.4)"

firmware_ok=1
firmware_note="$firmware"
if detect_firmware_bad; then
  firmware_ok=0
  firmware_note="$firmware (KNOWN BAD — causes ROCm crashes!)"
fi
check "$firmware_ok" "Firmware" "$firmware_note"
# Memory allocation
log_header "Memory Allocation"

# bytes_to_gib BYTES
# Prints BYTES in GiB with one decimal place, truncated (same rounding as the
# former `bc scale=1` call) but always with a leading digit: "0.5", not ".5".
# Input that is not a non-negative integer is treated as 0.
bytes_to_gib() {
  local bytes="${1:-0}" tenths
  [[ "$bytes" =~ ^[0-9]+$ ]] || bytes=0
  tenths=$(( 10#$bytes * 10 / 1073741824 ))
  printf '%d.%d\n' "$(( tenths / 10 ))" "$(( tenths % 10 ))"
}

vram_gib="$(bytes_to_gib "$vram_total")"
gtt_gib="$(bytes_to_gib "$gtt_total")"

# VRAM: should be <= 1 GiB (ideally 0.5 GiB)
vram_ok=0
vram_detail="${vram_gib} GiB"
if (( vram_total <= 1073741824 )); then
  vram_ok=1
else
  vram_detail+=" — should be 0.5 GiB in BIOS"
fi
check "$vram_ok" "VRAM (dedicated)" "$vram_detail"

# GTT: should be close to recommended (at least 75%)
gtt_rec_bytes=$(( rec_gttsize * 1048576 ))
gtt_ok=0
gtt_detail="${gtt_gib} GiB"
if (( gtt_total >= gtt_rec_bytes * 3 / 4 )); then
  gtt_ok=1
else
  # printf -v keeps a failing human_mib from aborting the script under set -e.
  printf -v gtt_detail '%s — should be ~%s with kernel params' \
    "$gtt_detail" "$(human_mib "$rec_gttsize")"
fi
check "$gtt_ok" "GTT (dynamic)" "$gtt_detail"

print_kv "VRAM in use" "$(human_bytes "$vram_used")"
print_kv "GTT in use" "$(human_bytes "$gtt_used")"
# Kernel boot parameters
log_header "Kernel Boot Parameters"

# iommu must be exactly "pt" to pass; any other value is shown as-is.
if [[ "$param_iommu" == "pt" ]]; then iommu_ok=1; else iommu_ok=0; fi
if [[ -n "$param_iommu" ]]; then
  iommu_detail="current: $param_iommu"
else
  iommu_detail="MISSING"
fi
check "$iommu_ok" "iommu=pt" "$iommu_detail"

# amdgpu.gttsize passes as soon as it is set at all.
if [[ -n "$param_gttsize" ]]; then
  gtt_param_ok=1
  gtt_param_detail="current: ${param_gttsize} MiB"
else
  gtt_param_ok=0
  gtt_param_detail="MISSING — recommended: ${rec_gttsize}"
fi
check "$gtt_param_ok" "amdgpu.gttsize" "$gtt_param_detail"

# ttm.pages_limit passes as soon as it is set at all.
if [[ -n "$param_pages" ]]; then
  pages_ok=1
  pages_detail="current: $param_pages"
else
  pages_ok=0
  pages_detail="MISSING — recommended: ${rec_pages}"
fi
check "$pages_ok" "ttm.pages_limit" "$pages_detail"

# Tuned profile
log_header "Performance Profile"
tuned_detail="$tuned"
if [[ "$tuned" == "accelerator-performance" ]]; then
  tuned_ok=1
else
  tuned_ok=0
  tuned_detail+=" — recommended: accelerator-performance"
fi
check "$tuned_ok" "Tuned profile" "$tuned_detail"
# Software stack
log_header "Software Stack"
check_info "ROCm" "$rocm_ver"
check_info "Vulkan" "$vulkan_drv $vulkan_ver"

# Toolboxes
# The container list is read once and reused for both the count and the
# listing. A failing detector yields an empty list instead of aborting the
# whole status screen under `set -euo pipefail`.
toolbox_names=()
if is_cmd toolbox; then
  mapfile -t toolbox_names < <(detect_toolbox_names || true)
fi
toolbox_count=${#toolbox_names[@]}

if (( toolbox_count > 0 )); then
  check_info "Toolbox containers" "$toolbox_count available"
  for name in "${toolbox_names[@]}"; do
    printf " ${DIM}%s${RESET}\n" "$name"
  done
else
  check_warn "Toolbox containers" "none — run 'make benchmark-setup'"
fi
# LLM stacks
log_header "LLM Stacks"
check_info "LM Studio" "$(detect_stack_lmstudio)"
check_info "opencode" "$(detect_stack_opencode)"
check_info "ollama" "$(detect_stack_ollama)"
check_info "llama.cpp (native)" "$(detect_stack_llamacpp)"

# scaled_1dp VALUE DIVISOR
# Prints VALUE / DIVISOR truncated to one decimal place, always with a leading
# digit ("0.5", not the ".5" that bc produces). Prints "n/a" when VALUE is not
# a non-negative integer, e.g. when a sensor could not be read.
scaled_1dp() {
  local value="${1:-}" divisor="$2" tenths
  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
    printf 'n/a\n'
    return 0
  fi
  tenths=$(( 10#$value * 10 / divisor ))
  printf '%d.%d\n' "$(( tenths / 10 ))" "$(( tenths % 10 ))"
}

# Sensors
log_header "Current Sensors"
gpu_temp="$(detect_gpu_temp)"
gpu_power="$(detect_gpu_power)"
gpu_busy="$(detect_gpu_busy)"
# Temperature is divided by 1000 and power by 1000000 before display.
print_kv "GPU Temperature" "$(scaled_1dp "$gpu_temp" 1000) C"
print_kv "GPU Power" "$(scaled_1dp "$gpu_power" 1000000) W"
print_kv "GPU Utilization" "${gpu_busy}%"
# Overall score
log_header "Optimization Score"
printf "\n ${BOLD}%d / %d${RESET} checks passing\n" "$score" "$total"

# Three verdicts: everything passing, at least half passing, or fewer.
if (( score != total )); then
  if (( score >= total / 2 )); then
    printf " ${YELLOW}Partially optimized — run 'make optimize' for improvements${RESET}\n"
  else
    printf " ${RED}Significant optimizations available — run 'make optimize'${RESET}\n"
  fi
else
  printf " ${GREEN}System is fully optimized!${RESET}\n"
fi
echo ""

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# Full system report — detailed audit with JSON + text output
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
# --json as the first argument switches to JSON-only output on stdout
# (nothing is saved to disk in that mode; see the early exit further down).
JSON_ONLY=false
[[ "${1:-}" == "--json" ]] && JSON_ONLY=true
# ── Gather all data ──────────────────────────────────────
# The detect_* / recommended_* / timestamp helpers are not defined in this
# file; presumably they come from the sourced lib/*.sh files (confirm there).
# Under `set -euo pipefail` a helper that exits non-zero inside one of these
# assignments aborts the report.
# ts is reused for the JSON "timestamp" field and for the report file names.
ts="$(timestamp)"
cpu_model="$(detect_cpu_model)"
# Stored as "cpu_threads" in the JSON.
cpu_threads="$(detect_cpu_cores)"
# Stored as "cpu_cores" in the JSON.
cpu_physical="$(detect_cpu_physical)"
gpu_name="$(detect_gpu_name)"
# Printed with a "0x" prefix in the text report.
gpu_device_id="$(detect_gpu_device_id)"
kernel="$(detect_kernel_version)"
firmware="$(detect_firmware_version)"
# The numeric values below are passed through int() when the JSON is built,
# so each must be a plain integer string.
vram_total="$(detect_vram_total)"
vram_used="$(detect_vram_used)"
gtt_total="$(detect_gtt_total)"
gtt_used="$(detect_gtt_used)"
ram_kb="$(detect_system_ram_kb)"
param_iommu="$(detect_kernel_param 'iommu')"
param_gttsize="$(detect_gttsize_param)"
param_pages="$(detect_pages_limit_param)"
# Full kernel command line, recorded verbatim.
cmdline="$(cat /proc/cmdline)"
tuned="$(detect_tuned_profile)"
rocm_ver="$(detect_rocm_version)"
vulkan_drv="$(detect_vulkan_driver)"
vulkan_ver="$(detect_vulkan_version)"
gpu_temp="$(detect_gpu_temp)"
gpu_power="$(detect_gpu_power)"
gpu_busy="$(detect_gpu_busy)"
rec_gttsize="$(recommended_gttsize_mib)"
rec_pages="$(recommended_pages_limit)"
# Toolbox list, as a JSON array of container names ("[]" when unavailable).
# The names are captured first and converted afterwards: with `pipefail`, a
# non-zero exit from the detector inside `detect | jq | jq || echo '[]'` would
# leave jq's array AND the fallback "[]" in the variable, which json.loads()
# rejects when the report is built.
toolboxes_json="[]"
if is_cmd toolbox; then
  toolbox_names="$(detect_toolbox_names || true)"
  if [[ -n "$toolbox_names" ]]; then
    toolboxes_json="$(jq -R . <<<"$toolbox_names" 2>/dev/null | jq -s . 2>/dev/null)" \
      || toolboxes_json="[]"
    # jq missing or printing nothing: fall back to an empty array.
    [[ -n "$toolboxes_json" ]] || toolboxes_json="[]"
  fi
fi

# LLM stacks
stack_ollama="$(detect_stack_ollama)"
stack_lmstudio="$(detect_stack_lmstudio)"
stack_llamacpp="$(detect_stack_llamacpp)"
stack_opencode="$(detect_stack_opencode)"

# ROCm packages (first 30 lines).
# sed reads its whole input, unlike `head -30`, which may close the pipe early
# and kill the producer with SIGPIPE — fatal under `set -eo pipefail`.
rocm_pkgs="$(detect_rocm_packages | sed -n '1,30p')"
# ── Build JSON (all data via env vars — no shell interpolation into Python) ──
# Each value is handed to python3 as an SR_* environment variable scoped to
# this one command, and the Python program is single-quoted, so no shell
# value is ever spliced into Python source text.
# The program prints one JSON document (indent=2), captured in $json_report.
# If python3 exits non-zero (for example on a failed int() conversion), the
# assignment fails and `set -e` stops the script.
json_report="$(
SR_TS="$ts" \
SR_CPU_MODEL="$cpu_model" SR_CPU_CORES="$cpu_physical" SR_CPU_THREADS="$cpu_threads" \
SR_GPU_NAME="$gpu_name" SR_GPU_DEVICE_ID="$gpu_device_id" SR_RAM_KB="$ram_kb" \
SR_VRAM_TOTAL="$vram_total" SR_VRAM_USED="$vram_used" \
SR_GTT_TOTAL="$gtt_total" SR_GTT_USED="$gtt_used" \
SR_REC_GTTSIZE="$rec_gttsize" SR_REC_PAGES="$rec_pages" \
SR_KERNEL="$kernel" SR_CMDLINE="$cmdline" \
SR_PARAM_IOMMU="$param_iommu" SR_PARAM_GTTSIZE="$param_gttsize" SR_PARAM_PAGES="$param_pages" \
SR_FIRMWARE="$firmware" SR_TUNED="$tuned" SR_ROCM="$rocm_ver" \
SR_VULKAN_DRV="$vulkan_drv" SR_VULKAN_VER="${vulkan_ver:-}" \
SR_GPU_TEMP="$gpu_temp" SR_GPU_POWER="$gpu_power" SR_GPU_BUSY="$gpu_busy" \
SR_TOOLBOXES="$toolboxes_json" \
SR_STACK_OLLAMA="$stack_ollama" SR_STACK_LMSTUDIO="$stack_lmstudio" \
SR_STACK_LLAMACPP="$stack_llamacpp" SR_STACK_OPENCODE="$stack_opencode" \
python3 -c '
import json, os
e = os.environ
# Numeric fields go through int(), which raises ValueError on empty or
# non-numeric input; string fields are copied verbatim.
data = {
"timestamp": e["SR_TS"],
"hardware": {
"cpu_model": e["SR_CPU_MODEL"],
"cpu_cores": int(e["SR_CPU_CORES"]),
"cpu_threads": int(e["SR_CPU_THREADS"]),
"gpu_name": e["SR_GPU_NAME"],
"gpu_device_id": e["SR_GPU_DEVICE_ID"],
"system_ram_kb": int(e["SR_RAM_KB"]),
},
"memory": {
"vram_total_bytes": int(e["SR_VRAM_TOTAL"]),
"vram_used_bytes": int(e["SR_VRAM_USED"]),
"gtt_total_bytes": int(e["SR_GTT_TOTAL"]),
"gtt_used_bytes": int(e["SR_GTT_USED"]),
"recommended_gttsize_mib": int(e["SR_REC_GTTSIZE"]),
"recommended_pages_limit": int(e["SR_REC_PAGES"]),
},
"kernel": {
"version": e["SR_KERNEL"],
"cmdline": e["SR_CMDLINE"],
"param_iommu": e["SR_PARAM_IOMMU"],
"param_gttsize": e["SR_PARAM_GTTSIZE"],
"param_pages_limit": e["SR_PARAM_PAGES"],
},
"firmware": e["SR_FIRMWARE"],
"tuned_profile": e["SR_TUNED"],
"rocm_version": e["SR_ROCM"],
"vulkan": {
"driver": e["SR_VULKAN_DRV"],
"version": e["SR_VULKAN_VER"],
},
"sensors": {
"gpu_temp_mc": int(e["SR_GPU_TEMP"]),
"gpu_power_uw": int(e["SR_GPU_POWER"]),
"gpu_busy_pct": int(e["SR_GPU_BUSY"]),
},
# SR_TOOLBOXES must already be a single valid JSON document.
"toolboxes": json.loads(e["SR_TOOLBOXES"]),
"stacks": {
"ollama": e["SR_STACK_OLLAMA"],
"lmstudio": e["SR_STACK_LMSTUDIO"],
"llamacpp": e["SR_STACK_LLAMACPP"],
"opencode": e["SR_STACK_OPENCODE"],
},
}
print(json.dumps(data, indent=2))
'
)"
# --json mode: print the (re-formatted) JSON and stop; nothing is saved.
# If json.tool rejects the text, the raw string is printed instead.
if $JSON_ONLY; then
  python3 -m json.tool <<<"$json_report" 2>/dev/null || echo "$json_report"
  exit 0
fi

# ── Save report ──────────────────────────────────────────
audit_dir="$(data_dir audits)"
json_file="${audit_dir}/report-${ts}.json"
text_file="${audit_dir}/report-${ts}.txt"
# Same fallback as above: store the raw string when json.tool fails.
if ! python3 -m json.tool <<<"$json_report" > "$json_file" 2>/dev/null; then
  echo "$json_report" > "$json_file"
fi
# ── Text output (also saved) ────────────────────────────
# Everything printed inside the braces is shown on stdout and, via tee,
# written to $text_file at the same time.
{
printf "Strix Halo Full System Report — %s\n" "$ts"
# Prints 60 "=" characters: "%.0s" consumes each of the 60 brace-expanded
# arguments while printing none of it, so only the literal "=" is emitted.
printf "=%.0s" {1..60}; echo
printf "\nHardware:\n"
printf " CPU: %s (%sC/%sT)\n" "$cpu_model" "$cpu_physical" "$cpu_threads"
printf " GPU: %s (device: 0x%s)\n" "$gpu_name" "$gpu_device_id"
printf " RAM: %s KB\n" "$ram_kb"
printf "\nMemory Allocation:\n"
printf " VRAM total: %s (used: %s)\n" "$(human_bytes "$vram_total")" "$(human_bytes "$vram_used")"
printf " GTT total: %s (used: %s)\n" "$(human_bytes "$gtt_total")" "$(human_bytes "$gtt_used")"
printf " Recommended: gttsize=%s MiB, pages_limit=%s\n" "$rec_gttsize" "$rec_pages"
printf "\nKernel:\n"
printf " Version: %s\n" "$kernel"
printf " Firmware: %s\n" "$firmware"
printf " Cmdline: %s\n" "$cmdline"
# Unset or empty kernel parameters are reported as "not set".
printf " iommu: %s\n" "${param_iommu:-not set}"
printf " gttsize: %s\n" "${param_gttsize:-not set}"
printf " pages_limit:%s\n" "${param_pages:-not set}"
printf "\nPerformance:\n"
printf " Tuned: %s\n" "$tuned"
# Raw sensor values are divided by 1000 (temperature) and 1000000 (power)
# for display; bc with scale=1 truncates to one decimal place.
printf " GPU temp: %s C\n" "$(echo "scale=1; $gpu_temp / 1000" | bc)"
printf " GPU power: %s W\n" "$(echo "scale=1; $gpu_power / 1000000" | bc)"
printf " GPU busy: %s%%\n" "$gpu_busy"
printf "\nSoftware:\n"
printf " ROCm: %s\n" "$rocm_ver"
printf " Vulkan: %s %s\n" "$vulkan_drv" "$vulkan_ver"
printf "\nROCm Packages:\n"
# Each package line is prefixed with one space.
echo "$rocm_pkgs" | sed 's/^/ /'
printf "\nToolboxes:\n"
if [[ "$toolboxes_json" == "[]" ]]; then
printf " none\n"
else
# Decode the JSON array and print one name per line; any Python failure is
# reported as "(parse error)" rather than aborting the report.
echo "$toolboxes_json" | python3 -c "import sys,json; [print(f' {x}') for x in json.load(sys.stdin)]" 2>/dev/null || printf " (parse error)\n"
fi
printf "\nLLM Stacks:\n"
printf " ollama: %s\n" "$stack_ollama"
printf " LM Studio: %s\n" "$stack_lmstudio"
printf " llama.cpp: %s\n" "$stack_llamacpp"
printf " opencode: %s\n" "$stack_opencode"
} | tee "$text_file"
echo ""
# Tell the user where both report files were written.
log_success "Report saved to:"
log_info " JSON: $json_file"
log_info " Text: $text_file"

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env bash
# Compare two benchmark runs side-by-side
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
# _list_run_dirs KIND
# Prints the sub-directories of "$(data_dir KIND)" without the trailing slash,
# or " (none)" when there are none.
_list_run_dirs() {
  ls -d "$(data_dir "$1")"/*/ 2>/dev/null | sed 's|/$||' || echo " (none)"
}

# _compare_usage
# Prints usage help followed by the runs available for comparison.
_compare_usage() {
  printf '%s\n' \
    "Usage: benchmark compare <before-dir> <after-dir>" \
    "" \
    "Examples:" \
    " bin/benchmark compare data/baselines/20260325-120000 data/benchmarks/post-opt-20260326-100000" \
    "" \
    "Available baselines:"
  _list_run_dirs baselines
  printf '%s\n' "" "Available benchmark runs:"
  _list_run_dirs benchmarks
}

# Two positional arguments are required.
if (( $# < 2 )); then
  _compare_usage
  exit 1
fi

BEFORE_DIR="$1"
AFTER_DIR="$2"

# Both run directories must contain a parsed summary.
for d in "$BEFORE_DIR" "$AFTER_DIR"; do
  if [[ -f "$d/summary.json" ]]; then
    continue
  fi
  log_error "No summary.json in $d"
  exit 1
done

log_header "Benchmark Comparison"

# Run names are the last path component of each directory.
before_name="$(basename "$BEFORE_DIR")"
after_name="$(basename "$AFTER_DIR")"
log_info "Before: $before_name"
log_info "After: $after_name"
# show_config_changes BEFORE_JSON AFTER_JSON
# Prints the configuration differences between two system-state.json files:
# VRAM/GTT totals, the three kernel parameters and the tuned profile.
# A file that cannot be read or parsed produces a one-line notice and status 0
# so that the benchmark comparison itself can still run.
show_config_changes() {
  python3 - "$1" "$2" << 'PYEOF'
import sys, json


def load(path):
    with open(path) as f:
        return json.load(f)


try:
    before = load(sys.argv[1])
    after = load(sys.argv[2])
except (OSError, ValueError) as exc:
    print(f" System state unreadable, skipping configuration diff ({exc})")
    sys.exit(0)


def gib(section, key):
    # Missing or null values count as 0 bytes.
    return (section.get(key) or 0) / 2**30


changes = []

# Check key config differences
b_mem = before.get("memory") or {}
a_mem = after.get("memory") or {}
for key, label in (("vram_total_bytes", "VRAM"), ("gtt_total_bytes", "GTT")):
    if b_mem.get(key) != a_mem.get(key):
        changes.append(
            f" {label}: {gib(b_mem, key):.1f} GiB -> {gib(a_mem, key):.1f} GiB"
        )

b_kern = before.get("kernel") or {}
a_kern = after.get("kernel") or {}
for param in ["param_iommu", "param_gttsize", "param_pages_limit"]:
    bv = b_kern.get(param, "")
    av = a_kern.get(param, "")
    if bv != av:
        changes.append(f" {param}: '{bv}' -> '{av}'")

bt = before.get("tuned_profile", "")
at = after.get("tuned_profile", "")
if bt != at:
    changes.append(f" tuned: {bt} -> {at}")

if changes:
    print(" Configuration changes:")
    for c in changes:
        print(c)
else:
    print(" No configuration changes detected")
PYEOF
}

# Show system state diff if available
if [[ -f "$BEFORE_DIR/system-state.json" ]] && [[ -f "$AFTER_DIR/system-state.json" ]]; then
  echo ""
  show_config_changes "$BEFORE_DIR/system-state.json" "$AFTER_DIR/system-state.json"
fi
# compare_summaries BEFORE_SUMMARY AFTER_SUMMARY
# Prints a before/after table of tokens-per-second for every
# (model, backend, test) present in either summary.json, with the relative
# change in the last column (green for faster, red for slower).
# A measured value of 0 is shown as "0.0"; "—" is reserved for results that
# are absent from one side. The delta is "—" when either side is absent or
# the baseline is 0.
compare_summaries() {
  python3 - "$1" "$2" << 'PYEOF'
import sys, json

with open(sys.argv[1]) as f:
    before = json.load(f)
with open(sys.argv[2]) as f:
    after = json.load(f)


# Index by (model, backend, test)
def index_results(data):
    idx = {}
    for r in data.get("results", []):
        idx[(r["model"], r["backend"], r["test"])] = r["tokens_per_sec"]
    return idx


b_idx = index_results(before)
a_idx = index_results(after)
all_keys = sorted(set(b_idx) | set(a_idx))
if not all_keys:
    print(" No comparable results found.")
    sys.exit(0)

# The delta cell is padded BEFORE the colour codes are added: escape
# sequences count towards a format width and would otherwise break alignment.
DELTA_WIDTH = 8
fmt = " {:<18} {:<14} {:<7} {:>9} {:>9} {}"
print(fmt.format("Model", "Backend", "Test", "Before", "After",
                 "Delta".rjust(DELTA_WIDTH)))
print(" " + "-" * 70)
for key in all_keys:
    model, backend, test = key
    b_val = b_idx.get(key)
    a_val = a_idx.get(key)
    b_str = "—" if b_val is None else f"{b_val:.1f}"
    a_str = "—" if a_val is None else f"{a_val:.1f}"
    if b_val is None or a_val is None or b_val == 0:
        d_str = "—".rjust(DELTA_WIDTH)
    else:
        delta_pct = (a_val - b_val) / b_val * 100
        if delta_pct > 0:
            d_str = "\033[32m" + f"+{delta_pct:.1f}%".rjust(DELTA_WIDTH) + "\033[0m"
        elif delta_pct < 0:
            d_str = "\033[31m" + f"{delta_pct:.1f}%".rjust(DELTA_WIDTH) + "\033[0m"
        else:
            d_str = "0.0%".rjust(DELTA_WIDTH)
    print(fmt.format(model[:18], backend[:14], test, b_str, a_str, d_str))
print()
PYEOF
}

# Compare results
echo ""
compare_summaries "$BEFORE_DIR/summary.json" "$AFTER_DIR/summary.json"

View File

@@ -0,0 +1,223 @@
#!/usr/bin/env bash
# Capture pre-optimization baseline benchmark
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
MODEL_DIR="$(data_dir models)"
TS="$(timestamp)"
# Every baseline gets its own timestamped directory.
RESULT_DIR="$(data_dir baselines)/$TS"
mkdir -p "$RESULT_DIR"

# Repetitions passed to llama-bench via -r.
REPS_STANDARD=5
REPS_LONGCTX=3

log_header "Baseline Benchmark Capture"
log_info "Results will be saved to: $RESULT_DIR"

# ── 1. Save system state ────────────────────────────────
log_info "Capturing system state..."
# The report's stderr is discarded to keep the JSON file clean, so a failure
# has to be reported here; otherwise `set -e` would end the run silently.
if ! bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null; then
  log_error "Could not capture system state (system-report.sh --json failed; run it directly for details)"
  exit 1
fi
# ── 2. Discover available toolboxes and models ──────────
existing="$(detect_toolbox_names 2>/dev/null || true)"

# toolbox_listed NAME LIST
# Succeeds when NAME is exactly one of the newline-separated entries in LIST.
# The name is matched as a fixed, whole-line string: names such as
# "llama-rocm-6.4.4" contain "." which a regex would treat as "any character".
# A here-string is used instead of `echo | grep -q`, which under `pipefail`
# can report failure when grep exits before echo has finished writing.
toolbox_listed() {
  local name="$1" list="$2"
  [[ -n "$name" ]] || return 1
  grep -qxF -- "$name" <<<"$list"
}

# Map toolbox names to llama-bench commands (same pattern as upstream)
declare -A BENCH_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-bench"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2]="/usr/local/bin/llama-bench"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)

available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
  if toolbox_listed "$tb" "$existing"; then
    available_backends+=("$tb")
    log_success "Backend: $tb"
  fi
done
# Nothing to benchmark without at least one backend container.
[[ ${#available_backends[@]} -gt 0 ]] || {
  log_error "No toolbox backends found. Run: make benchmark-setup"
  exit 1
}

# Find models
# Keeps single-file models and only the first shard of split models.
mapfile -t MODEL_PATHS < <(
  find "$MODEL_DIR" -type f -name '*.gguf' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)
[[ ${#MODEL_PATHS[@]} -gt 0 ]] || {
  log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup"
  exit 1
}

log_info "Found ${#MODEL_PATHS[@]} model(s):"
for model_file in "${MODEL_PATHS[@]}"; do
  printf " %s (%s)\n" "$(basename "$model_file")" "$(du -h "$model_file" | cut -f1)"
done

# ── 3. Start metric logging ─────────────────────────────
# The logger runs in the background for the whole benchmark run.
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!
log_info "Metric logger started (PID: $METRICS_PID)"

# cleanup
# EXIT handler: stops the background metric logger and reaps it.
# Both steps are best-effort; their failures are ignored.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
trap cleanup EXIT
# ── 4. Run benchmarks ───────────────────────────────────
# For every model × backend pair two llama-bench runs are made inside the
# backend's toolbox container: a standard test and a long-context test.
# Each run writes stdout+stderr to its own log file in $RESULT_DIR.
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
# Model name = file name without the .gguf suffix; used in log file names.
MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
for BACKEND in "${available_backends[@]}"; do
BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
# "." and "-" are replaced by "_" so the backend name is file-name friendly.
BACKEND_SAFE="${BACKEND//[.-]/_}"
# Build environment args for ROCm backends
# (prefixes the benchmark with `env ROCBLAS_USE_HIPBLASLT=1`; empty otherwise)
ENV_ARGS=()
if [[ "$BACKEND" == *rocm* ]]; then
ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
fi
# Standard test (pp512 + tg128, default context)
OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log"
# A non-empty log means this case already ran (successfully or not): skip it.
if [[ ! -s "$OUT" ]]; then
printf "\n${BOLD}>> [%s] %s — standard test${RESET}\n" "$BACKEND" "$MODEL_NAME"
CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD")
printf " cmd: %s\n" "${CMD[*]}"
if "${CMD[@]}" > "$OUT" 2>&1; then
log_success "Standard test complete"
tail -5 "$OUT"
else
# $? must be read first in this branch: it still holds the exit status of
# the benchmark command tested by the `if`.
log_error "Standard test failed (exit $?)"
# Marker line; the result parser skips any log containing "FAILED".
echo "FAILED" >> "$OUT"
fi
else
log_info "Skipping standard test (log exists): $OUT"
fi
# Long-context test (pp2048, tg32, ctx 32768)
OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
if [[ ! -s "$OUT_LC" ]]; then
printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME"
# -ub value: 2048 by default, 512 for Vulkan backends.
UB_SIZE=2048
[[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512
CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN"
-ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1
-p 2048 -n 32 -d 32768 -ub "$UB_SIZE"
-r "$REPS_LONGCTX")
printf " cmd: %s\n" "${CMD_LC[*]}"
if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then
log_success "Long-context test complete"
tail -5 "$OUT_LC"
else
# As above: $? is the exit status of the benchmark command.
log_error "Long-context test failed (exit $?)"
echo "FAILED" >> "$OUT_LC"
fi
else
log_info "Skipping long-context test (log exists): $OUT_LC"
fi
done
done
# parse_bench_logs DIR
# Reads every *.log in DIR, extracts the rows of the pipe-delimited
# llama-bench table and prints {"results": [...]} as JSON on stdout.
# Columns are located through the table's own header row rather than fixed
# positions: extra flags add columns (the standard runs already show "fa" and
# "mmap"; a non-default -ub is expected to add another), which would shift
# "test" and "t/s" away from hard-coded indices and silently drop those rows.
# Logs containing the marker "FAILED" are skipped.
parse_bench_logs() {
  python3 - "$1" << 'PYEOF'
import sys, re, json
from pathlib import Path

result_dir = Path(sys.argv[1])
results = []


def split_row(line):
    """Return the stripped cells of one markdown table row."""
    return [cell.strip() for cell in line.strip().strip("|").split("|")]


for logfile in sorted(result_dir.glob("*.log")):
    content = logfile.read_text(errors="replace")
    if "FAILED" in content:
        continue
    columns = None
    for line in content.splitlines():
        line = line.strip()
        if not line.startswith("|"):
            continue
        cells = split_row(line)
        # Separator row such as "| --- | ---: |".
        if all(cell and set(cell) <= set("-:") for cell in cells):
            continue
        names = [cell.lower() for cell in cells]
        if "model" in names and "test" in names and "t/s" in names:
            columns = names
            continue
        if columns is None or len(cells) != len(columns):
            continue
        row = dict(zip(columns, cells))
        test_type = row["test"]
        ts_raw = row["t/s"]
        # Parse "548.18 +/- 1.59" or just "548.18"
        ts_match = re.match(r"\d+(?:\.\d+)?", ts_raw)
        if not test_type or not ts_match:
            continue
        results.append({
            "file": logfile.name,
            "model": row["model"],
            "size": row.get("size", ""),
            "backend": row.get("backend", ""),
            "test": test_type,
            "tokens_per_sec": float(ts_match.group(0)),
            "raw": ts_raw,
        })

print(json.dumps({"results": results}, indent=2))
PYEOF
}

# print_results_table SUMMARY_JSON
# Prints the parsed results as a fixed-width table.
print_results_table() {
  python3 - "$1" << 'PYEOF'
import sys, json

with open(sys.argv[1]) as f:
    data = json.load(f)

results = data.get("results") or []
if not results:
    print(" No results parsed. Check log files for errors.")
    sys.exit(0)

# Print table
fmt = " {:<20} {:<16} {:<8} {:>10}"
print(fmt.format("Model", "Backend", "Test", "t/s"))
print(" " + "-" * 58)
for r in results:
    print(fmt.format(
        r["model"][:20],
        r["backend"][:16],
        r["test"],
        f"{r['tokens_per_sec']:.2f}",
    ))
PYEOF
}

# ── 5. Parse results into summary JSON ──────────────────
log_info "Parsing results..."
SUMMARY="$RESULT_DIR/summary.json"
parse_bench_logs "$RESULT_DIR" > "$SUMMARY"

# ── 6. Display summary ──────────────────────────────────
log_header "Baseline Results"
print_results_table "$SUMMARY"

echo ""
log_success "Baseline saved to: $RESULT_DIR"
log_info "Files: system-state.json, summary.json, metrics.csv, *.log"
log_info "Compare later with: bin/benchmark compare $RESULT_DIR <new-run-dir>"

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# Full benchmark suite — run all backends × models with tagging
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
MODEL_DIR="$(data_dir models)"
# Defaults, overridden by parse_args.
TAG="run"
BACKENDS_FILTER=""
MODELS_FILTER=""

# parse_args ARGS...
# Sets TAG (--tag/-t), BACKENDS_FILTER (--backends/-b) and MODELS_FILTER
# (--models/-m) from the command line.
# An option given without its value ends the script with status 2 and a clear
# message instead of tripping over an unbound "$2" / failing `shift 2`.
# Unrecognised arguments are still skipped, now with a warning.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --tag|-t|--backends|-b|--models|-m)
        if (( $# < 2 )); then
          log_error "Option $1 requires a value"
          exit 2
        fi
        case "$1" in
          --tag|-t) TAG="$2" ;;
          --backends|-b) BACKENDS_FILTER="$2" ;;
          --models|-m) MODELS_FILTER="$2" ;;
        esac
        shift 2
        ;;
      *)
        log_warn "Ignoring unknown argument: $1"
        shift
        ;;
    esac
  done
}
parse_args "$@"
TS="$(timestamp)"
# Every run gets its own "<tag>-<timestamp>" directory.
RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}"
mkdir -p "$RESULT_DIR"

# Repetitions passed to llama-bench via -r.
REPS_STANDARD=5
REPS_LONGCTX=3

log_header "Benchmark Suite: $TAG"
log_info "Results: $RESULT_DIR"

# Save system state
# The report's stderr is discarded to keep the JSON file clean, so a failure
# has to be reported here; otherwise `set -e` would end the run silently.
if ! bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null; then
  log_error "Could not capture system state (system-report.sh --json failed; run it directly for details)"
  exit 1
fi
# Discover backends
existing="$(detect_toolbox_names 2>/dev/null || true)"

# toolbox_listed NAME LIST
# Succeeds when NAME is exactly one of the newline-separated entries in LIST
# (fixed-string, whole-line match: "." in a name is not a regex wildcard).
toolbox_listed() {
  local name="$1" list="$2"
  [[ -n "$name" ]] || return 1
  grep -qxF -- "$name" <<<"$list"
}

# backend_selected NAME FILTER
# FILTER is the comma-separated --backends value. An empty FILTER selects
# every backend; otherwise NAME must equal one of its entries (spaces around
# entries are ignored). Done with a shell pattern so no pipeline is involved.
backend_selected() {
  local name="$1" filter="${2// /}"
  [[ -z "$filter" ]] && return 0
  [[ ",${filter}," == *",${name},"* ]]
}

declare -A BENCH_PATHS=(
  [llama-vulkan-radv]="/usr/sbin/llama-bench"
  [llama-vulkan-amdvlk]="/usr/sbin/llama-bench"
  [llama-rocm-6.4.4]="/usr/local/bin/llama-bench"
  [llama-rocm-7.2]="/usr/local/bin/llama-bench"
  [llama-rocm7-nightlies]="/usr/local/bin/llama-bench"
)

available_backends=()
for tb in "${!BENCH_PATHS[@]}"; do
  if toolbox_listed "$tb" "$existing" && backend_selected "$tb" "$BACKENDS_FILTER"; then
    available_backends+=("$tb")
  fi
done
# Stop early when discovery and filtering left no backend to run.
[[ ${#available_backends[@]} -gt 0 ]] || {
  log_error "No matching backends. Run: make benchmark-setup"
  exit 1
}
log_info "Backends: ${available_backends[*]}"
# Find models
# Keeps single-file models and only the first shard of split models.
mapfile -t MODEL_PATHS < <(
  find "$MODEL_DIR" -type f -name '*.gguf' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)

# model_selected FILENAME FILTER
# FILTER is the comma-separated --models value. Succeeds when one of its
# entries contains FILENAME, compared case-insensitively and literally.
# (Previously FILENAME was handed to grep as a regular expression, so "." and
# "[" in file names were interpreted, and the `echo | tr | grep -q` pipeline
# could report a false negative under `pipefail`.)
model_selected() {
  local name="${1,,}" filter="$2" entry
  local -a entries=()
  [[ -n "$name" ]] || return 1
  IFS=',' read -r -a entries <<<"$filter"
  for entry in "${entries[@]}"; do
    if [[ "${entry,,}" == *"$name"* ]]; then
      return 0
    fi
  done
  return 1
}

if [[ -n "$MODELS_FILTER" ]]; then
  filtered=()
  for p in "${MODEL_PATHS[@]}"; do
    if model_selected "$(basename "$p")" "$MODELS_FILTER"; then
      filtered+=("$p")
    fi
  done
  MODEL_PATHS=("${filtered[@]}")
fi
if (( ${#MODEL_PATHS[@]} == 0 )); then
  log_error "No models found. Run: make benchmark-setup"
  exit 1
fi
log_info "Models: ${#MODEL_PATHS[@]}"

# Start metric logging
METRICS_FILE="$RESULT_DIR/metrics.csv"
bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 &
METRICS_PID=$!

# cleanup
# EXIT handler: stops the background metric logger and reaps it.
# Both steps are guarded with `|| true` (as in run-baseline.sh) so that, with
# `set -e` active, a logger that has already exited cannot cut the handler
# short or leak its non-zero status into the script's exit code.
cleanup() {
  kill "$METRICS_PID" 2>/dev/null || true
  wait "$METRICS_PID" 2>/dev/null || true
}
trap cleanup EXIT
# Run benchmarks (same logic as run-baseline.sh)

# run_bench_case LABEL LOGFILE CMD...
# Runs CMD with stdout+stderr captured in LOGFILE, unless LOGFILE already has
# content. Prints the last three log lines on success; on failure appends the
# marker line "FAILED" to LOGFILE. Reads BACKEND and MODEL_NAME for the banner.
run_bench_case() {
  local label="$1" logfile="$2"
  shift 2
  if [[ -s "$logfile" ]]; then
    return 0
  fi
  printf "\n${BOLD}>> [%s] %s — ${label}${RESET}\n" "$BACKEND" "$MODEL_NAME"
  if "$@" > "$logfile" 2>&1; then
    log_success "Done"
    tail -3 "$logfile"
  else
    log_error "Failed"
    echo "FAILED" >> "$logfile"
  fi
}

for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
  for BACKEND in "${available_backends[@]}"; do
    BENCH_BIN="${BENCH_PATHS[$BACKEND]}"
    BACKEND_SAFE="${BACKEND//[.-]/_}"

    # ROCm backends get an `env ROCBLAS_USE_HIPBLASLT=1` prefix.
    if [[ "$BACKEND" == *rocm* ]]; then
      ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1)
    else
      ENV_ARGS=()
    fi

    # -ub for the long-context run: 512 on Vulkan, 2048 otherwise.
    if [[ "$BACKEND" == *vulkan* ]]; then
      UB_SIZE=512
    else
      UB_SIZE=2048
    fi

    # Standard test
    OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log"
    run_bench_case "standard" "$OUT" \
      toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" \
      -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD"

    # Long-context test
    OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log"
    run_bench_case "longctx" "$OUT_LC" \
      toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" \
      -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 \
      -p 2048 -n 32 -d 32768 -ub "$UB_SIZE" -r "$REPS_LONGCTX"
  done
done
# parse_bench_logs DIR
# Reads every *.log in DIR, extracts the rows of the pipe-delimited
# llama-bench table and prints {"results": [...]} as JSON on stdout.
# Columns are located through the table's own header row rather than fixed
# positions, so an additional column (expected when a run uses a non-default
# flag such as -ub) does not shift "test" and "t/s" and drop the row.
# Logs containing the marker "FAILED" are skipped.
parse_bench_logs() {
  python3 - "$1" << 'PYEOF'
import sys, re, json
from pathlib import Path

result_dir = Path(sys.argv[1])
results = []


def split_row(line):
    """Return the stripped cells of one markdown table row."""
    return [cell.strip() for cell in line.strip().strip("|").split("|")]


for logfile in sorted(result_dir.glob("*.log")):
    content = logfile.read_text(errors="replace")
    if "FAILED" in content:
        continue
    columns = None
    for line in content.splitlines():
        line = line.strip()
        if not line.startswith("|"):
            continue
        cells = split_row(line)
        # Separator row such as "| --- | ---: |".
        if all(cell and set(cell) <= set("-:") for cell in cells):
            continue
        names = [cell.lower() for cell in cells]
        if "model" in names and "test" in names and "t/s" in names:
            columns = names
            continue
        if columns is None or len(cells) != len(columns):
            continue
        row = dict(zip(columns, cells))
        test_type = row["test"]
        ts_raw = row["t/s"]
        ts_match = re.match(r"\d+(?:\.\d+)?", ts_raw)
        if not test_type or not ts_match:
            continue
        results.append({
            "file": logfile.name,
            "model": row["model"],
            "size": row.get("size", ""),
            "backend": row.get("backend", ""),
            "test": test_type,
            "tokens_per_sec": float(ts_match.group(0)),
            "raw": ts_raw,
        })

print(json.dumps({"results": results}, indent=2))
PYEOF
}

# print_results_table SUMMARY_JSON
# Prints the parsed results as a fixed-width table.
print_results_table() {
  python3 - "$1" << 'PYEOF'
import sys, json

with open(sys.argv[1]) as f:
    data = json.load(f)

results = data.get("results") or []
if not results:
    print(" No results parsed.")
    sys.exit(0)

fmt = " {:<20} {:<16} {:<8} {:>10}"
print(fmt.format("Model", "Backend", "Test", "t/s"))
print(" " + "-" * 58)
for r in results:
    print(fmt.format(r["model"][:20], r["backend"][:16], r["test"],
                     f"{r['tokens_per_sec']:.2f}"))
PYEOF
}

# Parse results
SUMMARY="$RESULT_DIR/summary.json"
# Parse llama-bench log files into summary JSON
parse_bench_logs "$RESULT_DIR" > "$SUMMARY"

log_header "Results"
print_results_table "$SUMMARY"

echo ""
log_success "Results saved to: $RESULT_DIR"

106
scripts/benchmark/setup.sh Normal file
View File

@@ -0,0 +1,106 @@
#!/usr/bin/env bash
# Benchmark setup — ensure toolboxes and test models are ready
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
# Checkout of the toolboxes repository. Export TOOLBOXES_REPO to point at a
# different location; the previous hard-coded path remains the default.
TOOLBOXES_REPO="${TOOLBOXES_REPO:-/data/workspace/projects/HomeLab/strix-halo-toolboxes/amd-strix-halo-llamacpp-toolboxes}"
MODEL_DIR="$(data_dir models)"

log_header "Benchmark Setup"

# ── 1. Check toolbox containers ──────────────────────────
log_info "Checking toolbox containers..."

# Minimum required: vulkan-radv (most stable)
REQUIRED_TOOLBOXES=("llama-vulkan-radv")
OPTIONAL_TOOLBOXES=("llama-rocm-6.4.4" "llama-rocm-7.2" "llama-vulkan-amdvlk")

# toolbox_listed NAME LIST
# Succeeds when NAME is exactly one of the newline-separated entries in LIST.
# Fixed-string, whole-line match: "." in names such as "llama-rocm-6.4.4" is
# not a regex wildcard. The here-string avoids `echo | grep -q`, which can
# report a false negative under `pipefail`.
toolbox_listed() {
  local name="$1" list="$2"
  [[ -n "$name" ]] || return 1
  grep -qxF -- "$name" <<<"$list"
}

existing="$(detect_toolbox_names 2>/dev/null || true)"
missing=()
for tb in "${REQUIRED_TOOLBOXES[@]}"; do
  if toolbox_listed "$tb" "$existing"; then
    log_success "Toolbox: $tb"
  else
    missing+=("$tb")
    log_warn "Toolbox missing: $tb"
  fi
done
for tb in "${OPTIONAL_TOOLBOXES[@]}"; do
  if toolbox_listed "$tb" "$existing"; then
    log_success "Toolbox: $tb (optional)"
  else
    log_info "Toolbox not present: $tb (optional)"
  fi
done

if (( ${#missing[@]} > 0 )); then
  log_info "Need to create required toolboxes."
  if [[ -d "$TOOLBOXES_REPO" ]]; then
    log_info "Found toolboxes repo at: $TOOLBOXES_REPO"
    if confirm "Create missing toolboxes using refresh-toolboxes.sh?"; then
      for tb in "${missing[@]}"; do
        log_info "Creating $tb..."
        bash "$TOOLBOXES_REPO/refresh-toolboxes.sh" "$tb"
      done
      # Re-read the container list so the GPU verification step that follows
      # also covers the toolboxes that were just created.
      existing="$(detect_toolbox_names 2>/dev/null || true)"
    fi
  else
    log_error "Toolboxes repo not found at: $TOOLBOXES_REPO"
    log_info "Clone it: git clone https://github.com/kyuz0/amd-strix-halo-toolboxes"
    log_info "Then re-run this setup."
    exit 1
  fi
fi
# ── 2. Verify GPU access inside toolboxes ────────────────
log_info "Verifying GPU access in toolboxes..."

# output_mentions_gpu TEXT
# Succeeds when TEXT contains "gpu", "vulkan" or "rocm" (case-insensitive).
output_mentions_gpu() {
  grep -qiE 'gpu|vulkan|rocm' <<<"$1"
}

for tb in "${REQUIRED_TOOLBOXES[@]}"; do
  # Whole-line match, so e.g. "llama-vulkan-radv-old" does not count.
  if grep -qxF -- "$tb" <<<"$existing"; then
    # The device list is captured first and inspected afterwards. Piping it
    # straight into `grep -q` lets grep close the pipe on the first match,
    # which under `pipefail` can make a working GPU look inaccessible.
    # As before, the command itself must also succeed.
    if devices="$(toolbox run -c "$tb" -- llama-cli --list-devices 2>&1)" \
        && output_mentions_gpu "$devices"; then
      log_success "GPU accessible in $tb"
    else
      log_warn "GPU may not be accessible in $tb — check device mappings"
    fi
  fi
done
# ── 3. Check for test models ────────────────────────────
log_info "Checking for test models in $MODEL_DIR..."
model_count=$(find "$MODEL_DIR" -name "*.gguf" 2>/dev/null | wc -l)

if (( model_count == 0 )); then
  log_warn "No GGUF models found in $MODEL_DIR"
  log_info "Download a test model. Example:"
  # Copy-paste download examples (blank line before, between and after).
  cat <<EOF

 # Small (4B, ~3 GB):
 huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \\
 --local-dir $MODEL_DIR

 # Medium (14B, ~9 GB):
 huggingface-cli download Qwen/Qwen3-14B-GGUF Qwen3-14B-Q4_K_M.gguf \\
 --local-dir $MODEL_DIR

EOF
  # Offer the small model directly when the downloader is installed.
  if ! is_cmd huggingface-cli; then
    log_info "Install huggingface-cli: pip install huggingface_hub[cli]"
  elif confirm "Download Qwen3-4B Q4_K_M (~3 GB) as test model?"; then
    huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \
      --local-dir "$MODEL_DIR"
    log_success "Model downloaded"
  fi
else
  log_success "Found $model_count model(s):"
  # One line per model: file name and human-readable size.
  find "$MODEL_DIR" -name "*.gguf" | while read -r gguf; do
    human_size=$(du -h "$gguf" | cut -f1)
    printf " %s (%s)\n" "$(basename "$gguf")" "$human_size"
  done
fi

log_header "Setup Complete"
log_info "Run 'make benchmark-baseline' to capture your baseline."

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Tmux-based monitoring dashboard
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
# tmux session name used for the dashboard.
SESSION="strix-monitor"
# Flags set from the command line; unknown arguments are ignored.
SIMPLE=false
WITH_LOG=false
while (( $# > 0 )); do
  if [[ "$1" == "--simple" || "$1" == "-s" ]]; then
    SIMPLE=true
  elif [[ "$1" == "--with-logging" || "$1" == "-l" ]]; then
    WITH_LOG=true
  fi
  shift
done

# Simple mode: just launch amdgpu_top
# (replaces this process with the first GPU monitor found, nvtop as fallback)
if $SIMPLE; then
  if is_cmd amdgpu_top; then
    exec amdgpu_top
  fi
  if is_cmd nvtop; then
    log_warn "amdgpu_top not found, falling back to nvtop"
    exec nvtop
  fi
  log_error "No GPU monitor installed. Run: make monitor-install"
  exit 1
fi
# Full dashboard requires tmux
is_cmd tmux || {
  log_error "tmux is required for dashboard mode. Run: make monitor-install"
  exit 1
}

# Pick GPU monitor: amdgpu_top when available, nvtop otherwise.
if is_cmd amdgpu_top; then
  GPU_MON="amdgpu_top"
else
  GPU_MON="nvtop"
fi

# Pick system monitor: btop, then htop, then plain top.
if is_cmd btop; then
  SYS_MON="btop"
elif is_cmd htop; then
  SYS_MON="htop"
else
  SYS_MON="top"
fi
# Kill existing session if running
tmux kill-session -t "$SESSION" 2>/dev/null || true

# Start background logging if requested
# LOG_CMD is the shell command shown in the bottom pane: a hint by default,
# a live tail of the CSV when --with-logging is given.
LOG_PID=""
LOG_CMD="echo Metric logging not active. Use --with-logging to enable.; read -r"
if $WITH_LOG; then
  LOG_FILE="$(data_dir logs)/metrics-$(timestamp).csv"
  bash "$SCRIPT_DIR/log-metrics.sh" --output "$LOG_FILE" &
  LOG_PID=$!
  printf -v LOG_CMD 'tail -f "%s"' "$LOG_FILE"
  log_info "Metric logger started (PID: $LOG_PID) → $LOG_FILE"
fi

# Cleanup logger on exit
# cleanup: EXIT handler; stops and reaps the background logger if one was
# started. Failures of kill/wait are ignored.
cleanup() {
  [[ -n "$LOG_PID" ]] || return 0
  kill "$LOG_PID" 2>/dev/null || true
  wait "$LOG_PID" 2>/dev/null || true
}
trap cleanup EXIT
# Create tmux layout
# +--------------------+--------------------+
# | GPU monitor        | System monitor     |
# |                    |                    |
# +--------------------+--------------------+
# | Metrics log tail / status               |
# +-----------------------------------------+
# The session is created detached at the current terminal size (120x40 when
# tput cannot report one) with the GPU monitor in the first pane.
tmux new-session -d -s "$SESSION" -x "$(tput cols 2>/dev/null || echo 120)" -y "$(tput lines 2>/dev/null || echo 40)" "$GPU_MON"
tmux split-window -t "$SESSION" -h "$SYS_MON"
# -f makes the new pane span the full window width. Without it the split only
# divides the active pane (the system monitor just created on the right), so
# the log pane would sit under the right column instead of across the bottom.
tmux split-window -t "$SESSION" -f -v -p 20 "$LOG_CMD"
tmux select-pane -t "$SESSION:0.0"

log_info "Dashboard started. Attach with: tmux attach -t $SESSION"
log_info "Detach with Ctrl+B then D. Kill with: tmux kill-session -t $SESSION"
tmux attach -t "$SESSION"

View File

@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# Install monitoring tools for Strix Halo
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
log_header "Monitoring Tools Installer"
# ── amdgpu_top (most important) ─────────────────────────
# Installation is attempted in this order, stopping at the first success:
#   1. pre-built RPM from the upstream GitHub release
#   2. the configured dnf repositories
#   3. a source build through cargo (only if cargo is available)
if is_cmd amdgpu_top; then
  log_success "amdgpu_top already installed: $(amdgpu_top --version 2>&1 | head -1)"
else
  log_info "Installing amdgpu_top (best AMD GPU monitor)..."
  installed=false
  AMDGPU_TOP_VERSION="0.11.2"
  RPM_URL="https://github.com/Umio-Yasuno/amdgpu_top/releases/download/v${AMDGPU_TOP_VERSION}/amdgpu_top-${AMDGPU_TOP_VERSION}-1.x86_64.rpm"

  # Method 1: Install RPM from GitHub releases (fastest, works on Fedora)
  log_info "Downloading pre-built RPM from GitHub releases..."
  # Download into a private mktemp directory. A fixed file name in /tmp can
  # be pre-created or swapped by another local user before the file is
  # handed to `sudo dnf install`.
  if rpm_dir="$(mktemp -d)"; then
    RPM_FILE="$rpm_dir/amdgpu_top-${AMDGPU_TOP_VERSION}.rpm"
    if curl -fsSL -o "$RPM_FILE" "$RPM_URL" 2>/dev/null; then
      if sudo dnf install -y "$RPM_FILE" 2>&1; then
        installed=true
        log_success "amdgpu_top installed from RPM"
      else
        log_warn "RPM install failed"
      fi
    else
      log_warn "RPM download failed"
    fi
    # Remove the download on every path, not only after a successful install.
    rm -rf -- "${rpm_dir:?}"
  else
    log_warn "Could not create a temporary directory for the RPM download"
  fi

  # Method 2: Try dnf repos
  if ! $installed; then
    log_info "Trying dnf repos..."
    if sudo dnf install -y amdgpu_top 2>/dev/null; then
      installed=true
      log_success "amdgpu_top installed via dnf"
    fi
  fi

  # Method 3: cargo (if available)
  if ! $installed && is_cmd cargo; then
    log_info "Building from source via cargo..."
    if cargo install amdgpu_top 2>&1; then
      installed=true
      log_success "amdgpu_top installed via cargo"
    else
      log_warn "cargo install failed"
    fi
  fi

  if ! $installed; then
    log_warn "Could not install amdgpu_top automatically."
    log_info "Manual options:"
    log_info " 1. Download RPM: curl -LO $RPM_URL && sudo dnf install ./amdgpu_top-*.rpm"
    log_info " 2. Download AppImage: https://github.com/Umio-Yasuno/amdgpu_top/releases/latest"
  fi
fi
# ── btop and tmux (tmux is needed for the dashboard) ─────
# Both tools are handled the same way: skip when already on PATH, otherwise
# try dnf and downgrade a failure to a warning so the script keeps going.
for pkg in btop tmux; do
  if is_cmd "$pkg"; then
    log_success "$pkg already installed"
    continue
  fi
  log_info "Installing $pkg..."
  if sudo dnf install -y "$pkg" 2>&1; then
    log_success "$pkg installed"
  else
    log_warn "Could not install $pkg via dnf"
  fi
done

# ── Verify existing tools ───────────────────────────────
# Final summary: one line per known monitoring tool, installed or not.
log_header "Monitoring Tools Status"
tools=(amdgpu_top nvtop btop amd-smi rocm-smi tmux)
for tool in "${tools[@]}"; do
  if ! is_cmd "$tool"; then
    log_warn "$tool — not installed"
    continue
  fi
  log_success "$tool"
done

View File

@@ -0,0 +1,127 @@
#!/usr/bin/env bash
# Background metric collector — samples GPU and system stats to CSV
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
INTERVAL=2
OUTPUT=""
DURATION=0 # 0 = indefinite

# Abort with a readable message when a value-taking option is the last
# argument. Without this, "$2" trips `set -u` with an "unbound variable"
# error that does not name the option.
# Arguments: the remaining command line ("$@"), option first.
need_value() {
  if (( $# < 2 )); then
    log_error "$1 requires a value"
    exit 1
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --output|-o)
      need_value "$@"
      OUTPUT="$2"
      shift 2
      ;;
    --interval|-i)
      need_value "$@"
      INTERVAL="$2"
      shift 2
      ;;
    --duration|-d)
      need_value "$@"
      DURATION="$2"
      shift 2
      ;;
    --help|-h)
      echo "Usage: log-metrics.sh [--output FILE] [--interval SECS] [--duration SECS]"
      exit 0
      ;;
    *)
      log_warn "Unknown argument: $1"
      shift
      ;;
  esac
done

# Validate numeric args. The interval must be at least 1 second; a duration
# of 0 is allowed and means "run until stopped".
if ! [[ "$INTERVAL" =~ ^[0-9]+$ ]] || (( 10#$INTERVAL == 0 )); then
  log_error "--interval must be a positive integer"
  exit 1
fi
if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
  log_error "--duration must be a non-negative integer (0 = run indefinitely)"
  exit 1
fi
# Force base 10 so values with leading zeros (e.g. "08") are not parsed as
# invalid octal by the arithmetic comparisons further down.
INTERVAL=$(( 10#$INTERVAL ))
DURATION=$(( 10#$DURATION ))

if [[ -z "$OUTPUT" ]]; then
  OUTPUT="$(data_dir logs)/metrics-$(timestamp).csv"
fi
mkdir -p "$(dirname "$OUTPUT")"

# Cache sysfs paths once (avoid re-globbing every iteration).
# SYSFS_TEMP / SYSFS_POWER stay empty when no matching hwmon file exists.
SYSFS_GPU_BUSY="$GPU_SYSFS/gpu_busy_percent"
SYSFS_VRAM_USED="$GPU_SYSFS/mem_info_vram_used"
SYSFS_GTT_USED="$GPU_SYSFS/mem_info_gtt_used"
SYSFS_TEMP=""
SYSFS_POWER=""
for f in "$GPU_SYSFS"/hwmon/hwmon*/temp1_input; do
  [[ -f "$f" ]] && SYSFS_TEMP="$f" && break
done
for f in "$GPU_SYSFS"/hwmon/hwmon*/power1_average; do
  [[ -f "$f" ]] && SYSFS_POWER="$f" && break
done

# Write CSV header (truncates any existing file at $OUTPUT)
echo "timestamp,gpu_busy_pct,vram_used_mib,gtt_used_mib,gpu_temp_c,gpu_power_w,cpu_pct,ram_used_mib" > "$OUTPUT"
log_info "Logging metrics every ${INTERVAL}s → $OUTPUT"
if (( DURATION > 0 )); then
  log_info "Will stop after ${DURATION}s"
fi
start_time=$SECONDS
stopped=false
# Report how many samples were written to $OUTPUT.
# The global `stopped` flag makes the report run at most once, even if the
# function is invoked again. The sample count is the file's line count minus
# the CSV header line.
cleanup() {
  if [[ "$stopped" == true ]]; then
    return
  fi
  stopped=true
  local total_lines sample_count
  total_lines="$(wc -l < "$OUTPUT")"
  sample_count=$(( total_lines - 1 ))
  log_info "Metric logger stopped. $sample_count samples in $OUTPUT"
}
trap cleanup EXIT
# Snapshot the first line of /proc/stat using bash builtins only.
# Line layout: "cpu user nice system idle iowait irq softirq steal ..."
# Sets globals:
#   CPU_TOTAL - sum of user, nice, system, idle, iowait, irq, softirq and
#               steal (steal counts as 0 when the field is absent)
#   CPU_IDLE  - the idle field alone
read_cpu_stat() {
  local _label user nice system idle iowait irq softirq steal _rest
  read -r _label user nice system idle iowait irq softirq steal _rest < /proc/stat
  CPU_TOTAL=$(( user + nice + system + idle + iowait + irq + softirq + ${steal:-0} ))
  CPU_IDLE=$idle
}
# Read one non-negative integer from file $2 into the variable named by $1.
# Falls back to 0 when the path is empty, the file cannot be opened or read,
# or the content is not a plain integer.
# The braces matter: redirections are applied left to right, so in
# `read x < "$f" 2>/dev/null` a failing open is reported on the original
# stderr before stderr is silenced, which spammed the terminal on every
# sample when a sensor file was missing.
read_metric() {
  local raw=""
  if [[ -n "$2" ]]; then
    { read -r raw < "$2"; } 2>/dev/null || raw=""
  fi
  [[ "$raw" =~ ^[0-9]+$ ]] || raw=0
  printf -v "$1" '%s' "$raw"
}

# Sampling loop: appends one CSV row per iteration until the optional
# duration elapses (or forever when DURATION is 0).
while true; do
  printf -v ts '%(%Y-%m-%d %H:%M:%S)T' -1

  # GPU metrics — direct reads, no subshells
  read_metric gpu_busy "$SYSFS_GPU_BUSY"
  read_metric vram_bytes "$SYSFS_VRAM_USED"
  read_metric gtt_bytes "$SYSFS_GTT_USED"
  read_metric temp_mc "$SYSFS_TEMP"
  read_metric power_uw "$SYSFS_POWER"
  vram_mib=$(( vram_bytes / 1048576 ))
  gtt_mib=$(( gtt_bytes / 1048576 ))
  # Integer math only: whole part, then one truncated decimal digit.
  gpu_temp_c=$(( temp_mc / 1000 )).$(( (temp_mc % 1000) / 100 ))
  gpu_power_w=$(( power_uw / 1000000 )).$(( (power_uw % 1000000) / 100000 ))

  # CPU usage (delta between two snapshots taken 0.1 s apart)
  read_cpu_stat
  prev_total=$CPU_TOTAL
  prev_idle=$CPU_IDLE
  sleep 0.1
  read_cpu_stat
  delta_total=$(( CPU_TOTAL - prev_total ))
  delta_idle=$(( CPU_IDLE - prev_idle ))
  if (( delta_total > 0 )); then
    # cpu_pct is in tenths of a percent (0..1000)
    cpu_pct=$(( (delta_total - delta_idle) * 1000 / delta_total ))
    # Format N as N/10 . N%10, handling single-digit values (e.g., 5 → 0.5)
    cpu_pct_fmt="$(( cpu_pct / 10 )).$(( cpu_pct % 10 ))"
  else
    cpu_pct_fmt="0.0"
  fi

  # RAM used (bash builtins only); /proc/meminfo values are in kB
  local_mem_total=0
  local_mem_avail=0
  while IFS=': ' read -r key val _; do
    case "$key" in
      MemTotal) local_mem_total=$val ;;
      MemAvailable) local_mem_avail=$val; break ;;
    esac
  done < /proc/meminfo
  ram_used_mib=$(( (local_mem_total - local_mem_avail) / 1024 ))

  echo "$ts,$gpu_busy,$vram_mib,$gtt_mib,$gpu_temp_c,$gpu_power_w,$cpu_pct_fmt,$ram_used_mib" >> "$OUTPUT"

  # Check duration
  if (( DURATION > 0 && SECONDS - start_time >= DURATION )); then
    break
  fi
  sleep "$INTERVAL"
done

View File

@@ -0,0 +1,149 @@
#!/usr/bin/env bash
# Configure kernel boot parameters for unified memory optimization
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
GRUB_FILE="/etc/default/grub"
log_header "Kernel Boot Parameter Optimization"

# ── Check root early ────────────────────────────────────
# Bail out before doing any work when not running as root.
(( EUID == 0 )) || {
  log_error "This script requires root. Re-run with: sudo make optimize-kernel"
  exit 1
}

# ── Show current state ───────────────────────────────────
log_info "Current kernel command line:"
printf " ${DIM}%s${RESET}\n" "$(< /proc/cmdline)"
printf '\n'

# Values currently in effect, and the values this tool recommends.
param_iommu="$(detect_kernel_param 'iommu')"
param_gttsize="$(detect_gttsize_param)"
param_pages="$(detect_pages_limit_param)"
rec_gttsize="$(recommended_gttsize_mib)"
rec_pages="$(recommended_pages_limit)"

# ── Check what's needed ──────────────────────────────────
# Each parameter prints a pass/fail line; any failure flags needs_change.
needs_change=false
log_info "Parameter status:"

if [[ "$param_iommu" != "pt" ]]; then
  iommu_detail="missing"
  if [[ -n "$param_iommu" ]]; then
    iommu_detail="current: $param_iommu"
  fi
  print_status fail "iommu=pt" "$iommu_detail"
  needs_change=true
else
  print_status pass "iommu=pt" "already set"
fi

if [[ -n "$param_gttsize" ]] && (( param_gttsize >= rec_gttsize )); then
  print_status pass "amdgpu.gttsize" "current: $param_gttsize MiB"
else
  gtt_prefix="missing, "
  if [[ -n "$param_gttsize" ]]; then
    gtt_prefix="current: $param_gttsize MiB, "
  fi
  print_status fail "amdgpu.gttsize" "${gtt_prefix}recommended: $rec_gttsize MiB (~$(human_mib "$rec_gttsize"))"
  needs_change=true
fi

if [[ -n "$param_pages" ]] && (( param_pages >= rec_pages )); then
  print_status pass "ttm.pages_limit" "current: $param_pages"
else
  pages_prefix="missing, "
  if [[ -n "$param_pages" ]]; then
    pages_prefix="current: $param_pages, "
  fi
  print_status fail "ttm.pages_limit" "${pages_prefix}recommended: $rec_pages"
  needs_change=true
fi

if [[ "$needs_change" == false ]]; then
  printf '\n'
  log_success "All kernel parameters are already optimal!"
  exit 0
fi

# ── Explain what we're doing ─────────────────────────────
printf '\n'
log_info "These parameters enable unified memory for the integrated GPU:"
printf '%s\n' \
  " iommu=pt IOMMU passthrough — reduces memory access latency" \
  " amdgpu.gttsize=$rec_gttsize GPU can dynamically access ~$(human_mib "$rec_gttsize") system RAM" \
  " ttm.pages_limit=$rec_pages Pin limit for GPU memory pages ($(human_mib "$rec_gttsize") in 4K pages)" \
  ""

# ── Apply changes ────────────────────────────────────────
# First confirmation; declining leaves the system untouched.
confirm "Apply these kernel parameters to GRUB?" || {
  log_info "Skipped. You can apply manually by editing $GRUB_FILE"
  exit 0
}
# Backup the GRUB defaults file before touching it
BACKUP_DIR="$(data_dir backups)"
backup_file="$BACKUP_DIR/grub-$(timestamp).bak"
cp "$GRUB_FILE" "$backup_file"
log_success "GRUB backup saved: $backup_file"

# Parse current GRUB_CMDLINE_LINUX using Python (data via env vars, not interpolation).
# Both double- and single-quoted values are accepted; \x27 is the single
# quote, spelled that way because the Python source sits inside shell single
# quotes. Prints an empty line when no quoted assignment is found.
current_cmdline="$(GRUB_PATH="$GRUB_FILE" python3 -c '
import re, os
with open(os.environ["GRUB_PATH"]) as f:
    for line in f:
        m = re.match(r"^GRUB_CMDLINE_LINUX=([\"\x27])(.*)\1", line)
        if m:
            print(m.group(2))
            raise SystemExit(0)
print("")
')"

# Remove any existing values of these params, then collapse runs of
# whitespace and trim both ends. sed is used for the cleanup instead of
# xargs because xargs strips quotes and backslashes (and fails on an
# unmatched quote), which would corrupt arguments containing escaped quotes.
new_cmdline="$(sed -E \
  -e 's/\biommu=[^ ]*//g' \
  -e 's/\bamd_iommu=[^ ]*//g' \
  -e 's/\bamdgpu\.gttsize=[^ ]*//g' \
  -e 's/\bttm\.pages_limit=[^ ]*//g' \
  -e 's/[[:space:]]+/ /g' \
  -e 's/^ //' \
  -e 's/ $//' \
  <<<"$current_cmdline")"

# Add new params (no leading space when nothing else is left)
new_cmdline="${new_cmdline:+$new_cmdline }iommu=pt amdgpu.gttsize=$rec_gttsize ttm.pages_limit=$rec_pages"

log_info "GRUB_CMDLINE_LINUX change:"
printf " ${RED}Before:${RESET} %s\n" "$current_cmdline"
printf " ${GREEN}After:${RESET} %s\n" "$new_cmdline"
echo ""
if ! confirm "Write this change?"; then
  log_info "Aborted. Backup remains at: $backup_file"
  exit 0
fi

# Apply using Python (all data via env vars — no shell interpolation into Python code)
GRUB_PATH="$GRUB_FILE" NEW_CMDLINE="$new_cmdline" python3 -c '
import re, os
grub_path = os.environ["GRUB_PATH"]
new_line = "GRUB_CMDLINE_LINUX=\"" + os.environ["NEW_CMDLINE"] + "\""
with open(grub_path) as f:
    content = f.read()
# A callable replacement is inserted verbatim. A plain string replacement
# would have its backslash sequences interpreted by re.sub.
content, replaced = re.subn(
    r"^GRUB_CMDLINE_LINUX=.*",
    lambda _m: new_line,
    content,
    count=1,
    flags=re.MULTILINE,
)
# No existing assignment: append one instead of silently writing nothing.
if not replaced:
    if content and not content.endswith("\n"):
        content += "\n"
    content += new_line + "\n"
with open(grub_path, "w") as f:
    f.write(content)
'
log_success "GRUB config updated"

# Regenerate GRUB — prefer grubby on modern Fedora (BLS), fall back to grub2-mkconfig
log_info "Regenerating boot configuration..."
if is_cmd grubby; then
  grubby --update-kernel=ALL --args="iommu=pt amdgpu.gttsize=$rec_gttsize ttm.pages_limit=$rec_pages"
  log_success "Boot entries updated via grubby"
elif [[ -d /boot/grub2 ]]; then
  grub2-mkconfig -o /boot/grub2/grub.cfg
  log_success "GRUB regenerated via grub2-mkconfig"
elif [[ -d /boot/grub ]]; then
  grub-mkconfig -o /boot/grub/grub.cfg
  log_success "GRUB regenerated via grub-mkconfig"
else
  log_error "Could not find grubby or grub config directory. Regenerate manually."
  exit 1
fi
echo ""
log_warn "REBOOT REQUIRED for kernel parameters to take effect."
log_info "After reboot, verify with: make audit"

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Rollback optimization changes
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
GRUB_FILE="/etc/default/grub"
BACKUP_DIR="$(data_dir backups)"
log_header "Rollback Optimizations"
# ── 1. GRUB rollback ────────────────────────────────────
# Lists every grub-*.bak file under BACKUP_DIR in reverse name order and
# offers to restore the first entry (assumes backup names sort by time —
# confirm against the timestamp helper's format).
log_info "GRUB backups:"
mapfile -t grub_backups < <(find "$BACKUP_DIR" -name 'grub-*.bak' -print 2>/dev/null | sort -r)
if (( ${#grub_backups[@]} == 0 )); then
  log_info " No GRUB backups found"
else
  for i in "${!grub_backups[@]}"; do
    printf " [%d] %s\n" "$i" "${grub_backups[$i]}"
  done
  echo ""
  if confirm "Restore most recent GRUB backup?"; then
    require_root
    backup="${grub_backups[0]}"
    cp "$backup" "$GRUB_FILE"
    log_success "GRUB restored from: $backup"
    log_info "Regenerating boot configuration..."
    if is_cmd grubby; then
      # On BLS systems, also need to remove args via grubby.
      # A grubby failure stays non-fatal, but it is reported as a warning
      # instead of being logged as a success.
      if grubby --update-kernel=ALL --remove-args="iommu amdgpu.gttsize ttm.pages_limit" 2>/dev/null; then
        log_success "Boot entries updated via grubby"
      else
        log_warn "grubby could not update boot entries; kernel arguments may still be set. Check with: grubby --info=ALL"
      fi
    elif [[ -d /boot/grub2 ]]; then
      grub2-mkconfig -o /boot/grub2/grub.cfg
      log_success "GRUB regenerated via grub2-mkconfig"
    elif [[ -d /boot/grub ]]; then
      grub-mkconfig -o /boot/grub/grub.cfg
      log_success "GRUB regenerated via grub-mkconfig"
    else
      log_error "Could not find grubby or grub config directory. Regenerate manually."
    fi
    log_warn "Reboot required for changes to take effect."
  fi
fi
# ── 2. Tuned profile rollback ───────────────────────────
# Offers to switch back to the profile name stored in
# tuned-previous-profile.txt, if that file exists and names a profile
# different from the active one.
prev_profile_file="$BACKUP_DIR/tuned-previous-profile.txt"
if [[ ! -f "$prev_profile_file" ]]; then
  log_info "No previous tuned profile saved"
else
  prev_profile="$(< "$prev_profile_file")"
  # Falls back to "unknown" when the tuned-adm pipeline fails.
  current="$(tuned-adm active 2>/dev/null | sed 's/Current active profile: //' || echo "unknown")"
  log_info "Tuned profile: $current (previous: $prev_profile)"
  if [[ "$current" != "$prev_profile" ]]; then
    if confirm "Restore tuned profile to $prev_profile?"; then
      sudo tuned-adm profile "$prev_profile"
      log_success "Tuned profile restored to: $prev_profile"
    fi
  fi
fi

# ── 3. BIOS reminder ────────────────────────────────────
# Firmware settings are outside this script's reach; just remind the user.
printf '\n'
log_warn "BIOS VRAM changes cannot be rolled back automatically."
log_info "To revert: Reboot → F10 → Advanced → UMA Frame Buffer Size → restore previous value"

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Switch tuned profile to accelerator-performance
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
RECOMMENDED="accelerator-performance"
log_header "Tuned Profile Optimization"

if ! is_cmd tuned-adm; then
  log_error "tuned is not installed. Install with: sudo dnf install tuned"
  exit 1
fi

current="$(detect_tuned_profile)"
log_info "Current profile: $current"
if [[ "$current" == "$RECOMMENDED" ]]; then
  log_success "Already using $RECOMMENDED"
  exit 0
fi

# Check availability.
# The listing is captured first instead of piping into `grep -q`: under
# `set -o pipefail`, grep exiting on the first match can kill the producer
# with SIGPIPE and make the whole pipeline look like "not found".
# The pattern is anchored to a "- <name>" list entry so it cannot match a
# longer profile name or the "Current active profile" line.
available_profiles="$(tuned-adm list 2>/dev/null || true)"
if ! grep -qE -- "^[[:space:]]*-[[:space:]]+${RECOMMENDED}([[:space:]]|\$)" <<<"$available_profiles"; then
  log_error "$RECOMMENDED profile not available"
  log_info "Available profiles:"
  # Print only the list entries (lines starting with "-"), indented by one space.
  sed -n 's/^-/ -/p' <<<"$available_profiles"
  exit 1
fi

echo ""
log_info "Recommended: $RECOMMENDED"
log_info "Description: Throughput performance with disabled higher latency STOP states"
log_info "Benefit: 5-8% improvement in prompt processing (pp) benchmarks"
log_info "No reboot required."
echo ""
if ! confirm "Switch to $RECOMMENDED?"; then
  log_info "Skipped"
  exit 0
fi

# Save current for rollback
printf '%s\n' "$current" > "$(data_dir backups)/tuned-previous-profile.txt"
sudo tuned-adm profile "$RECOMMENDED"

# Re-read the active profile to confirm the switch took effect.
new_profile="$(detect_tuned_profile)"
if [[ "$new_profile" == "$RECOMMENDED" ]]; then
  log_success "Profile switched to: $new_profile"
else
  log_error "Profile switch may have failed. Current: $new_profile"
fi

View File

@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# Post-optimization verification checklist
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
log_header "Optimization Verification"
# Running tally: `total` counts every check, `score` counts the passing ones.
score=0
total=0

# Record one checklist item and print its status line.
# Arguments:
#   $1 - "1" when the item passed; any other value counts as a failure
#   $2 - short label
#   $3 - detail text shown next to the label
# Globals: increments total, and score on a pass.
check() {
  local pass="$1" label="$2" detail="$3"
  local outcome=fail
  total=$(( total + 1 ))
  if [[ "$pass" == "1" ]]; then
    outcome=pass
    score=$(( score + 1 ))
  fi
  print_status "$outcome" "$label" "$detail"
}
# Succeed when the version string $1 (e.g. "6.18.4-200.fc43.x86_64") is at
# least $2.$3.$4. A missing patch component counts as 0; a string that does
# not start with "<major>.<minor>" fails the comparison.
kernel_at_least() {
  [[ "$1" =~ ^([0-9]+)\.([0-9]+)(\.([0-9]+))? ]] || return 1
  local have want
  # 10# forces base 10 so components with leading zeros are not read as octal.
  have=$(( 10#${BASH_REMATCH[1]} * 1000000 + 10#${BASH_REMATCH[2]} * 1000 + 10#${BASH_REMATCH[4]:-0} ))
  want=$(( $2 * 1000000 + $3 * 1000 + $4 ))
  (( have >= want ))
}

# Print a byte count as GiB truncated to one decimal place (536870912 -> 0.5).
# Integer math replaces the former `bc` call, which printed values below 1
# without the leading zero (".5") and required bc to be installed.
bytes_to_gib() {
  local tenths=$(( ${1:-0} * 10 / 1073741824 ))
  printf '%d.%d' "$(( tenths / 10 ))" "$(( tenths % 10 ))"
}

# Kernel version — the patch level is compared too, so 6.18.0-6.18.3 no
# longer pass a check labelled ">= 6.18.4".
kernel="$(detect_kernel_version)"
kernel_ok=0
kernel_at_least "$kernel" 6 18 4 && kernel_ok=1
check "$kernel_ok" "Kernel >= 6.18.4" "$kernel"

# Firmware
fw_ok=1
detect_firmware_bad && fw_ok=0
check "$fw_ok" "Firmware (not 20251125)" "$(detect_firmware_version)"

# Kernel params
iommu_val="$(detect_kernel_param 'iommu')"
iommu_ok=0
[[ "$iommu_val" == "pt" ]] && iommu_ok=1
check "$iommu_ok" "iommu=pt" "${iommu_val:-not set}"

gttsize="$(detect_gttsize_param)"
rec_gttsize="$(recommended_gttsize_mib)"
gtt_ok=0
[[ -n "$gttsize" ]] && (( gttsize >= rec_gttsize )) && gtt_ok=1
check "$gtt_ok" "amdgpu.gttsize" "${gttsize:-not set} (recommended: $rec_gttsize)"

pages="$(detect_pages_limit_param)"
rec_pages="$(recommended_pages_limit)"
pages_ok=0
[[ -n "$pages" ]] && (( pages >= rec_pages )) && pages_ok=1
check "$pages_ok" "ttm.pages_limit" "${pages:-not set} (recommended: $rec_pages)"

# Tuned profile
tuned="$(detect_tuned_profile)"
tuned_ok=0
[[ "$tuned" == "accelerator-performance" ]] && tuned_ok=1
check "$tuned_ok" "Tuned profile" "$tuned"

# VRAM (should be <= 1 GiB)
vram="$(detect_vram_total)"
vram_gib="$(bytes_to_gib "$vram")"
vram_ok=0
(( vram <= 1073741824 )) && vram_ok=1
check "$vram_ok" "VRAM <= 1 GiB" "${vram_gib} GiB"

# GTT (should be close to recommended: at least 3/4 of it)
gtt="$(detect_gtt_total)"
gtt_gib="$(bytes_to_gib "$gtt")"
rec_gtt_bytes=$(( rec_gttsize * 1048576 ))
gtt_mem_ok=0
(( gtt >= rec_gtt_bytes * 3 / 4 )) && gtt_mem_ok=1
check "$gtt_mem_ok" "GTT >= $(human_mib "$rec_gttsize")" "${gtt_gib} GiB"

# GPU monitor installed
monitor_ok=0
is_cmd amdgpu_top && monitor_ok=1
check "$monitor_ok" "amdgpu_top installed" "$(is_cmd amdgpu_top && echo 'yes' || echo 'no — run make monitor-install')"

# Summary — thresholds are all, >= 3/4 and >= 1/2 of the checks passing
echo ""
print_divider
printf "\n ${BOLD}Score: %d / %d${RESET}\n" "$score" "$total"
if (( score == total )); then
  printf " ${GREEN}Fully optimized!${RESET} Run 'make benchmark' to measure performance.\n"
elif (( score >= total * 3 / 4 )); then
  printf " ${YELLOW}Nearly there${RESET} — check the failed items above.\n"
elif (( score >= total / 2 )); then
  printf " ${YELLOW}Partially optimized${RESET} — run 'make optimize' for the remaining items.\n"
else
  printf " ${RED}Significant optimizations pending${RESET} — run 'make optimize'\n"
fi
echo ""

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# BIOS VRAM guidance + GTT verification
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
log_header "VRAM / GTT Memory Optimization"

# Print a value given in tenths as "<whole>.<tenth>" (5 -> 0.5, 123 -> 12.3).
# Negative input is clamped to 0. Integer math replaces the former `bc`
# calls, which printed values below 1 without the leading zero (".5 GiB")
# and required bc to be installed.
fmt_tenths() {
  local t=$1
  (( t >= 0 )) || t=0
  printf '%d.%d' "$(( t / 10 ))" "$(( t % 10 ))"
}

vram_total="$(detect_vram_total)"
gtt_total="$(detect_gtt_total)"
# Byte counts -> GiB, truncated to one decimal place
vram_gib="$(fmt_tenths "$(( vram_total * 10 / 1073741824 ))")"
gtt_gib="$(fmt_tenths "$(( gtt_total * 10 / 1073741824 ))")"
# kB -> GiB, truncated to one decimal place
ram_gib="$(fmt_tenths "$(( $(detect_system_ram_kb) * 10 / 1048576 ))")"

log_info "Current memory allocation:"
print_kv "VRAM (dedicated)" "${vram_gib} GiB"
print_kv "GTT (dynamic)" "${gtt_gib} GiB"
print_kv "System RAM (visible)" "${ram_gib} GiB"
echo ""

# ── Check if BIOS VRAM change is needed ──────────────────
# Optimal: VRAM <= 1 GiB (0.5 GiB ideal), rest dynamically via GTT
if (( vram_total > 1073741824 )); then
  log_warn "VRAM is ${vram_gib} GiB — this permanently locks memory away from the OS."
  log_info "AMD recommends 512 MB dedicated VRAM for Strix Halo."
  log_info "The GPU accesses additional memory dynamically via GTT (kernel params)."
  echo ""
  printf "${BOLD}BIOS Configuration Steps (HP ZBook Ultra G1a):${RESET}\n"
  echo ""
  echo " 1. Reboot the laptop"
  echo " 2. Press F10 repeatedly during boot to enter BIOS Setup"
  echo " 3. Navigate to: Advanced > Built-in Device Options"
  echo " (or Advanced > Display > UMA Frame Buffer Size)"
  echo " 4. Set UMA Frame Buffer Size to: 512 MB (or smallest available)"
  echo " 5. Save and Exit (F10)"
  echo ""
  echo " NOTE: The exact menu path may vary by BIOS version."
  echo " Look for 'UMA', 'Frame Buffer', 'VRAM', or 'iGPU Memory'."
  echo ""
  log_info "After BIOS change + reboot with kernel params, expected state:"
  echo " VRAM: ~512 MiB"
  echo " GTT: ~$(human_mib "$(recommended_gttsize_mib)") (with kernel params)"
  # Physical RAM in GiB minus the 0.5 GiB that stays reserved as VRAM
  echo " System RAM: ~$(fmt_tenths "$(( $(detect_total_physical_ram_kb) * 10 / 1048576 - 5 ))") GiB visible"
  echo ""
else
  log_success "VRAM is ${vram_gib} GiB — already optimal!"
fi
# ── Check GTT ────────────────────────────────────────────
# GTT counts as "good" once it reaches three quarters of the recommended size.
rec_gttsize="$(recommended_gttsize_mib)"
rec_gtt_bytes=$(( rec_gttsize * 1048576 ))
gtt_floor=$(( rec_gtt_bytes * 3 / 4 ))
if (( gtt_total < gtt_floor )); then
  log_warn "GTT is ${gtt_gib} GiB — low (recommended: ~$(human_mib "$rec_gttsize"))"
  log_info "This requires kernel boot parameters. Run: make optimize-kernel"
else
  log_success "GTT is ${gtt_gib} GiB — good (recommended: ~$(human_mib "$rec_gttsize"))"
fi

# ── Optional: amd-debug-tools ────────────────────────────
# Three cases: amd-ttm already present, pipx missing (print instructions),
# or pipx present (offer to install).
printf '\n'
log_header "Optional: amd-debug-tools (amd-ttm)"
log_info "AMD provides 'amd-debug-tools' for runtime GTT/TTM inspection."
if is_cmd amd-ttm; then
  log_success "amd-ttm is installed"
  log_info "Current GTT settings:"
  # Best effort: a failing amd-ttm must not abort the script.
  amd-ttm 2>/dev/null || true
elif ! is_cmd pipx; then
  log_info "Install pipx first: sudo dnf install pipx"
  log_info "Then: pipx install amd-debug-tools"
else
  log_info "Install with: pipx install amd-debug-tools"
  if confirm "Install amd-debug-tools via pipx?"; then
    pipx install amd-debug-tools
    log_success "Installed. Run 'amd-ttm' to inspect GTT allocation."
  fi
fi