From c596e38e9e39c3727051bd31af2048ea42c5aab4 Mon Sep 17 00:00:00 2001 From: Felipe Cardoso Date: Wed, 25 Mar 2026 20:13:15 +0100 Subject: [PATCH] Initial commit --- .gitignore | 5 + .idea/.gitignore | 10 ++ Makefile | 58 ++++++++ bin/audit | 18 +++ bin/benchmark | 20 +++ bin/monitor | 20 +++ bin/optimize | 31 +++++ configs/grub-cmdline.conf | 17 +++ docs/bios-vram-guide.md | 40 ++++++ lib/common.sh | 52 +++++++ lib/detect.sh | 197 ++++++++++++++++++++++++++ lib/format.sh | 74 ++++++++++ scripts/audit/quick-glance.sh | 180 ++++++++++++++++++++++++ scripts/audit/system-report.sh | 194 ++++++++++++++++++++++++++ scripts/benchmark/compare.sh | 140 +++++++++++++++++++ scripts/benchmark/run-baseline.sh | 223 ++++++++++++++++++++++++++++++ scripts/benchmark/run-suite.sh | 194 ++++++++++++++++++++++++++ scripts/benchmark/setup.sh | 106 ++++++++++++++ scripts/monitor/dashboard.sh | 90 ++++++++++++ scripts/monitor/install-tools.sh | 97 +++++++++++++ scripts/monitor/log-metrics.sh | 127 +++++++++++++++++ scripts/optimize/kernel-params.sh | 149 ++++++++++++++++++++ scripts/optimize/rollback.sh | 67 +++++++++ scripts/optimize/tuned-profile.sh | 56 ++++++++ scripts/optimize/verify.sh | 97 +++++++++++++ scripts/optimize/vram-gtt.sh | 83 +++++++++++ 26 files changed, 2345 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 Makefile create mode 100755 bin/audit create mode 100755 bin/benchmark create mode 100755 bin/monitor create mode 100755 bin/optimize create mode 100644 configs/grub-cmdline.conf create mode 100644 docs/bios-vram-guide.md create mode 100644 lib/common.sh create mode 100644 lib/detect.sh create mode 100644 lib/format.sh create mode 100644 scripts/audit/quick-glance.sh create mode 100644 scripts/audit/system-report.sh create mode 100644 scripts/benchmark/compare.sh create mode 100644 scripts/benchmark/run-baseline.sh create mode 100644 scripts/benchmark/run-suite.sh create mode 100644 scripts/benchmark/setup.sh 
# Strix Halo Optimization Toolkit - convenience targets.
#
# Every target below runs a command and produces no file of the same name,
# so ALL of them are declared phony. Without that, a file or directory that
# happens to be called e.g. "rollback" or "benchmark-setup" would make the
# corresponding `make <target>` a silent no-op.
#
# Text after "## " on a target line is picked up by `make help`.
.PHONY: help \
        audit audit-full \
        monitor monitor-simple monitor-install monitor-log \
        benchmark-setup benchmark-baseline benchmark benchmark-compare \
        optimize optimize-kernel optimize-tuned optimize-vram \
        verify rollback

help: ## Show available commands
	@echo "Strix Halo Optimization Toolkit"
	@echo ""
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}'

# --- Audit ---
audit: ## Quick system status (single screen)
	@bash bin/audit --quick

audit-full: ## Full system report (saved to data/audits/)
	@bash bin/audit --full

# --- Monitor ---
monitor: ## Launch tmux monitoring dashboard
	@bash bin/monitor --dashboard

monitor-simple: ## Launch amdgpu_top only
	@bash bin/monitor --simple

monitor-install: ## Install monitoring tools (amdgpu_top, btop)
	@bash scripts/monitor/install-tools.sh

monitor-log: ## Start background metric logger
	@bash bin/monitor --log

# --- Benchmark ---
benchmark-setup: ## Ensure toolboxes and test models are ready
	@bash scripts/benchmark/setup.sh

benchmark-baseline: ## Capture pre-optimization baseline
	@bash bin/benchmark baseline

benchmark: ## Run full benchmark suite
	@bash bin/benchmark run

benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare BEFORE=dir AFTER=dir)
	@bash bin/benchmark compare $(BEFORE) $(AFTER)

# --- Optimize ---
optimize: ## Interactive optimization walkthrough
	@bash bin/optimize --all

optimize-kernel: ## Configure kernel boot parameters
	@bash scripts/optimize/kernel-params.sh

optimize-tuned: ## Switch to accelerator-performance profile
	@bash scripts/optimize/tuned-profile.sh

optimize-vram: ## BIOS VRAM guidance + GTT verification
	@bash scripts/optimize/vram-gtt.sh

verify: ## Post-optimization verification checklist
	@bash scripts/optimize/verify.sh

rollback: ## Rollback optimizations
	@bash scripts/optimize/rollback.sh
&& pwd)" + +case "${1:-help}" in + setup) exec bash "$SCRIPT_DIR/scripts/benchmark/setup.sh" ;; + baseline) exec bash "$SCRIPT_DIR/scripts/benchmark/run-baseline.sh" "${@:2}" ;; + run) exec bash "$SCRIPT_DIR/scripts/benchmark/run-suite.sh" "${@:2}" ;; + compare) exec bash "$SCRIPT_DIR/scripts/benchmark/compare.sh" "${@:2}" ;; + *) + echo "Usage: benchmark [options]" + echo " setup Ensure toolboxes and test models are ready" + echo " baseline Capture pre-optimization baseline" + echo " run Run full benchmark suite (--tag NAME, --backends LIST)" + echo " compare Compare two runs (DIR1 DIR2)" + exit 1 + ;; +esac diff --git a/bin/monitor b/bin/monitor new file mode 100755 index 0000000..71119a7 --- /dev/null +++ b/bin/monitor @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# Monitoring dispatcher +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +case "${1:---dashboard}" in + --dashboard|-d) exec bash "$SCRIPT_DIR/scripts/monitor/dashboard.sh" ;; + --simple|-s) exec bash "$SCRIPT_DIR/scripts/monitor/dashboard.sh" --simple ;; + --log|-l) exec bash "$SCRIPT_DIR/scripts/monitor/log-metrics.sh" ;; + --install|-i) exec bash "$SCRIPT_DIR/scripts/monitor/install-tools.sh" ;; + *) + echo "Usage: monitor [--dashboard|--simple|--log|--install]" + echo " --dashboard Tmux 3-pane: GPU + system + metrics (default)" + echo " --simple amdgpu_top only" + echo " --log Start background CSV metric logger" + echo " --install Install monitoring tools" + exit 1 + ;; +esac diff --git a/bin/optimize b/bin/optimize new file mode 100755 index 0000000..406400b --- /dev/null +++ b/bin/optimize @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Optimization dispatcher +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +case "${1:---all}" in + --all|-a) + echo "Running optimization walkthrough..." 
+ bash "$SCRIPT_DIR/scripts/optimize/tuned-profile.sh" + bash "$SCRIPT_DIR/scripts/optimize/kernel-params.sh" + bash "$SCRIPT_DIR/scripts/optimize/vram-gtt.sh" + echo "" + bash "$SCRIPT_DIR/scripts/optimize/verify.sh" + ;; + --kernel|-k) exec bash "$SCRIPT_DIR/scripts/optimize/kernel-params.sh" ;; + --tuned|-t) exec bash "$SCRIPT_DIR/scripts/optimize/tuned-profile.sh" ;; + --vram|-v) exec bash "$SCRIPT_DIR/scripts/optimize/vram-gtt.sh" ;; + --verify) exec bash "$SCRIPT_DIR/scripts/optimize/verify.sh" ;; + --rollback) exec bash "$SCRIPT_DIR/scripts/optimize/rollback.sh" ;; + *) + echo "Usage: optimize [--all|--kernel|--tuned|--vram|--verify|--rollback]" + echo " --all Full optimization walkthrough (default)" + echo " --kernel Configure kernel boot parameters" + echo " --tuned Switch tuned profile" + echo " --vram BIOS VRAM + GTT guidance" + echo " --verify Post-optimization checklist" + echo " --rollback Revert changes" + exit 1 + ;; +esac diff --git a/configs/grub-cmdline.conf b/configs/grub-cmdline.conf new file mode 100644 index 0000000..0997d85 --- /dev/null +++ b/configs/grub-cmdline.conf @@ -0,0 +1,17 @@ +# Recommended kernel boot parameters for AMD Strix Halo +# Add to GRUB_CMDLINE_LINUX in /etc/default/grub +# +# After editing, regenerate GRUB: +# sudo grub2-mkconfig -o /boot/grub2/grub.cfg +# Then reboot. 
+# +# For 64GB system (HP ZBook Ultra G1a): +iommu=pt amdgpu.gttsize=60416 ttm.pages_limit=15466496 +# +# For 128GB system (Framework Desktop, GMKtec EVO X2): +# iommu=pt amdgpu.gttsize=126976 ttm.pages_limit=32505856 +# +# Parameter explanation: +# iommu=pt - IOMMU passthrough, reduces memory access latency +# amdgpu.gttsize=N - Max GPU-addressable system RAM in MiB (total - 4GB reserve) +# ttm.pages_limit=N - Max pinnable 4K pages (gttsize_MiB * 256) diff --git a/docs/bios-vram-guide.md b/docs/bios-vram-guide.md new file mode 100644 index 0000000..e8fa32b --- /dev/null +++ b/docs/bios-vram-guide.md @@ -0,0 +1,40 @@ +# BIOS VRAM Configuration — HP ZBook Ultra G1a + +## Why Change VRAM? + +AMD Strix Halo uses **unified memory** — the CPU and GPU share the same physical RAM. By default, the HP ZBook allocates **32 GB as dedicated VRAM**, permanently locking that memory away from the OS even when the GPU isn't using it. + +AMD recommends keeping dedicated VRAM at **512 MB** (minimum) and using **GTT (Graphics Translation Table)** for dynamic GPU memory access. With kernel boot parameters, the GPU can access up to ~60 GB on demand while the CPU retains full flexibility. + +## Current vs Optimal (64 GB system) + +| Setting | Default | Optimal | +|---------|---------|---------| +| Dedicated VRAM | 32 GB | 0.5 GB | +| GTT (dynamic) | ~15.5 GB | ~59 GB | +| OS visible RAM | ~31 GB | ~63.5 GB | + +## Steps + +1. **Reboot** the laptop +2. Press **F10** repeatedly during boot to enter BIOS Setup +3. Navigate to: **Advanced** > **Built-in Device Options** (or **Display** section) +4. Find: **UMA Frame Buffer Size** (may also be labeled "iGPU Memory" or "VRAM Size") +5. Set to: **512 MB** (or the smallest available option) +6. **Save and Exit** (F10) + +> The exact menu path may vary by BIOS version. If you can't find it under "Built-in Device Options", check under "Advanced > Display" or "Chipset Configuration". + +## After BIOS Change + +1. 
# --- Logging helpers ---------------------------------------------------------
# Each helper prints one tagged line built from all of its arguments ("$*").
#
# Colour codes are passed as %b ARGUMENTS rather than being expanded inside the
# printf format string (ShellCheck SC2059); %b still interprets the "\033[..m"
# escapes stored in the colour variables. The ${VAR:-} defaults keep the
# helpers usable under `set -u` even if the colour variables were never
# defined (output is then simply uncoloured).
#
# Arguments: $* - message text (printed verbatim via %s, never as a format)
# Outputs:   log_info/log_success/log_warn/log_header -> stdout
#            log_error                                -> stderr

# Informational message, tagged [INFO].
log_info() { printf '%b[INFO]%b %s\n' "${BLUE:-}" "${RESET:-}" "$*"; }

# Success message, tagged [OK].
log_success() { printf '%b[OK]%b %s\n' "${GREEN:-}" "${RESET:-}" "$*"; }

# Warning message, tagged [WARN].
log_warn() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${RESET:-}" "$*"; }

# Error message, tagged [ERR]; goes to stderr so it never pollutes captured output.
log_error() { printf '%b[ERR]%b %s\n' "${RED:-}" "${RESET:-}" "$*" >&2; }

# Section header: a blank line followed by "=== title ===".
log_header() { printf '\n%b=== %s ===%b\n' "${BOLD:-}" "$*" "${RESET:-}"; }
#######################################
# Ask a yes/no question on stdout and read the answer from stdin.
# Arguments: $1 - prompt text (default: "Continue?")
# Outputs:   the prompt followed by " [y/N] " (no trailing newline)
# Returns:   0 only if the answer is exactly "y" or "Y";
#            1 for anything else, including an empty line or EOF.
#######################################
confirm() {
  local prompt="${1:-Continue?}"
  # Keep the answer local so a caller's own $reply is not overwritten.
  local reply=""

  # Colours go in as %b arguments, not into the format string (SC2059).
  printf '%b%s [y/N] %b' "${YELLOW:-}" "$prompt" "${RESET:-}"

  # read fails on EOF (closed or empty stdin); treat that as "no answer"
  # instead of letting `set -e` abort a caller that invoked us bare.
  read -r reply || true

  [[ "$reply" =~ ^[Yy]$ ]]
}
#######################################
# Look up a kernel boot parameter.
# Arguments:
#   $1 - parameter name, matched literally (dots are NOT regex wildcards),
#        e.g. "iommu" or "amdgpu.gttsize"
#   $2 - optional command line to search; defaults to /proc/cmdline.
#        An unreadable /proc/cmdline is treated as an empty command line.
# Outputs:
#   "<value>"  if the parameter appears as name=value
#   "present"  if it appears as a bare flag
#   nothing    if it is absent (or given with an empty value)
#   When the parameter is repeated, the LAST occurrence is reported, since a
#   later setting on the command line normally supersedes an earlier one.
# Returns: always 0, so callers can capture the output under `set -e`.
#######################################
detect_kernel_param() {
  local param="$1"
  local cmdline word value=""
  local -a words=()

  if (( $# >= 2 )); then
    cmdline="$2"
  else
    cmdline="$(cat /proc/cmdline 2>/dev/null)" || cmdline=""
  fi

  # Split on any whitespace regardless of the caller's IFS. With -d '' read
  # consumes the whole string and reports EOF (status 1), hence `|| true`.
  IFS=$' \t\n' read -r -d '' -a words <<< "$cmdline" || true
  (( ${#words[@]} > 0 )) || return 0

  for word in "${words[@]}"; do
    if [[ "$word" == "$param="?* ]]; then
      # Whole-word match anchored at the start: "amd_iommu=on" is not "iommu".
      value="${word#"$param="}"
    elif [[ "$word" == "$param" ]]; then
      value="present"
    fi
  done

  if [[ -n "$value" ]]; then
    printf '%s\n' "$value"
  fi
}
#######################################
# Recommended amdgpu.gttsize for this machine, in MiB.
#
# Takes the total from detect_total_physical_ram_kb, rounds it DOWN to whole
# GiB, subtracts a 4 GiB reserve and converts back to MiB.
#
# The result is clamped at 0: when the detected total is below the reserve
# (or detection yielded 0) the unclamped formula would return a negative size,
# which recommended_pages_limit would then turn into a negative page count.
#
# Outputs: integer MiB (>= 0) on stdout
#######################################
recommended_gttsize_mib() {
  local -r reserve_gib=4
  local total_kb total_gib gtt_gib

  total_kb="$(detect_total_physical_ram_kb)"
  total_gib=$(( ${total_kb:-0} / 1024 / 1024 ))

  gtt_gib=$(( total_gib - reserve_gib ))
  (( gtt_gib > 0 )) || gtt_gib=0

  echo $(( gtt_gib * 1024 ))
}
#######################################
# Format a byte count with a binary unit.
# Arguments: $1 - non-negative integer byte count (default 0)
# Outputs:   "<n> B", "<n> KiB", "<n> MiB" (whole numbers, rounded down) or
#            "<n>.<d> GiB" (one decimal, truncated rather than rounded);
#            no trailing newline.
# Implemented with shell integer arithmetic only, so it does not depend on
# the external `bc` program being installed.
#######################################
human_bytes() {
  local bytes="${1:-0}"
  local -r gib=1073741824 mib=1048576 kib=1024

  if (( bytes >= gib )); then
    # Whole GiB, then the first decimal digit taken from the remainder.
    printf '%d.%d GiB' "$(( bytes / gib ))" "$(( bytes % gib * 10 / gib ))"
  elif (( bytes >= mib )); then
    printf '%d MiB' "$(( bytes / mib ))"
  elif (( bytes >= kib )); then
    printf '%d KiB' "$(( bytes / kib ))"
  else
    printf '%d B' "$bytes"
  fi
}

#######################################
# Format a size given in MiB.
# Arguments: $1 - non-negative integer MiB count (default 0)
# Outputs:   "<n> MiB" below 1024, otherwise "<n>.<d> GiB" (one decimal,
#            truncated); no trailing newline.
# Same integer-only approach as human_bytes (no `bc` required).
#######################################
human_mib() {
  local mib="${1:-0}"
  local -r mib_per_gib=1024

  if (( mib >= mib_per_gib )); then
    printf '%d.%d GiB' "$(( mib / mib_per_gib ))" "$(( mib % mib_per_gib * 10 / mib_per_gib ))"
  else
    printf '%d MiB' "$mib"
  fi
}
shellcheck disable=SC2059 — format string is a trusted constant from callers + printf "${fmt}\n" "$@" +} diff --git a/scripts/audit/quick-glance.sh b/scripts/audit/quick-glance.sh new file mode 100644 index 0000000..4621d58 --- /dev/null +++ b/scripts/audit/quick-glance.sh @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +# Quick-glance system audit — single screen status overview +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" +source "$SCRIPT_DIR/../../lib/detect.sh" +source "$SCRIPT_DIR/../../lib/format.sh" + +# ── Gather data ────────────────────────────────────────── +cpu_model="$(detect_cpu_model)" +cpu_threads="$(detect_cpu_cores)" +cpu_physical="$(detect_cpu_physical)" +gpu_name="$(detect_gpu_name)" +kernel="$(detect_kernel_version)" +firmware="$(detect_firmware_version)" + +vram_total="$(detect_vram_total)" +vram_used="$(detect_vram_used)" +gtt_total="$(detect_gtt_total)" +gtt_used="$(detect_gtt_used)" +ram_kb="$(detect_system_ram_kb)" +ram_bytes=$(( ram_kb * 1024 )) + +param_iommu="$(detect_kernel_param 'iommu')" +param_gttsize="$(detect_gttsize_param)" +param_pages="$(detect_pages_limit_param)" + +tuned="$(detect_tuned_profile)" + +rocm_ver="$(detect_rocm_version)" +vulkan_drv="$(detect_vulkan_driver)" +vulkan_ver="$(detect_vulkan_version)" + +rec_gttsize="$(recommended_gttsize_mib)" +rec_pages="$(recommended_pages_limit)" + +# ── Score tracking ─────────────────────────────────────── +score=0 +total=0 + +check() { + local pass="$1" label="$2" detail="$3" + total=$(( total + 1 )) + if [[ "$pass" == "1" ]]; then + score=$(( score + 1 )) + print_status pass "$label" "$detail" + else + print_status fail "$label" "$detail" + fi +} + +check_warn() { + local label="$1" detail="$2" + print_status warn "$label" "$detail" +} + +check_info() { + local label="$1" detail="$2" + print_status info "$label" "$detail" +} + +# ── Output ─────────────────────────────────────────────── +printf 
# Kernel & Firmware
log_header "Kernel & Firmware"

#######################################
# Test whether a kernel release string meets a minimum major.minor.patch.
# Arguments:
#   $1 - release string as printed by `uname -r`,
#        e.g. "6.18.4-200.fc43.x86_64"
#   $2 $3 $4 - required major, minor and patch numbers
# Returns: 0 if $1 >= required version, 1 otherwise (also 1 when $1 has no
#          leading numeric major component). A missing or non-numeric minor
#          or patch component (e.g. "6.19-rc1") counts as 0.
#######################################
kernel_version_at_least() {
  local version="$1" want_major="$2" want_minor="$3" want_patch="$4"
  local major minor patch _rest

  # Split on "." and "-" so "6.18.4-200.fc43" yields 6 / 18 / 4.
  IFS='.-' read -r major minor patch _rest <<< "$version"

  # Keep only the leading digits of each component ("0+" -> "0", "rc1" -> "").
  major="${major%%[!0-9]*}"
  minor="${minor%%[!0-9]*}"
  patch="${patch%%[!0-9]*}"
  [[ -n "$major" ]] || return 1

  # Force base 10 so a component with a leading zero is not read as octal.
  major=$(( 10#$major ))
  minor=$(( 10#${minor:-0} ))
  patch=$(( 10#${patch:-0} ))

  (( major > want_major )) && return 0
  (( major < want_major )) && return 1
  (( minor > want_minor )) && return 0
  (( minor < want_minor )) && return 1
  (( patch >= want_patch ))
}

# The requirement shown to the user is 6.18.4, so the patch level has to be
# part of the comparison; checking only major.minor would accept 6.18.0-6.18.3.
kernel_ok=0
if kernel_version_at_least "$kernel" 6 18 4; then
  kernel_ok=1
fi
check "$kernel_ok" "Kernel version" "$kernel (need >= 6.18.4)"
&& gtt_param_ok=1 +check "$gtt_param_ok" "amdgpu.gttsize" "$([ -n "$param_gttsize" ] && echo "current: ${param_gttsize} MiB" || echo "MISSING — recommended: ${rec_gttsize}")" + +pages_ok=0 +[[ -n "$param_pages" ]] && pages_ok=1 +check "$pages_ok" "ttm.pages_limit" "$([ -n "$param_pages" ] && echo "current: $param_pages" || echo "MISSING — recommended: ${rec_pages}")" + +# Tuned profile +log_header "Performance Profile" +tuned_ok=0 +[[ "$tuned" == "accelerator-performance" ]] && tuned_ok=1 +check "$tuned_ok" "Tuned profile" "$tuned$([ "$tuned_ok" -eq 0 ] && echo " — recommended: accelerator-performance")" + +# Software stack +log_header "Software Stack" +check_info "ROCm" "$rocm_ver" +check_info "Vulkan" "$vulkan_drv $vulkan_ver" + +# Toolboxes +toolbox_count=0 +if is_cmd toolbox; then + toolbox_count=$(detect_toolbox_names | wc -l) +fi +if (( toolbox_count > 0 )); then + check_info "Toolbox containers" "$toolbox_count available" + detect_toolbox_names | while read -r name; do + printf " ${DIM}%s${RESET}\n" "$name" + done +else + check_warn "Toolbox containers" "none — run 'make benchmark-setup'" +fi + +# LLM stacks +log_header "LLM Stacks" +check_info "LM Studio" "$(detect_stack_lmstudio)" +check_info "opencode" "$(detect_stack_opencode)" +check_info "ollama" "$(detect_stack_ollama)" +check_info "llama.cpp (native)" "$(detect_stack_llamacpp)" + +# Sensors +log_header "Current Sensors" +gpu_temp="$(detect_gpu_temp)" +gpu_power="$(detect_gpu_power)" +gpu_busy="$(detect_gpu_busy)" +print_kv "GPU Temperature" "$(echo "scale=1; $gpu_temp / 1000" | bc) C" +print_kv "GPU Power" "$(echo "scale=1; $gpu_power / 1000000" | bc) W" +print_kv "GPU Utilization" "${gpu_busy}%" + +# Overall score +log_header "Optimization Score" +printf "\n ${BOLD}%d / %d${RESET} checks passing\n" "$score" "$total" +if (( score == total )); then + printf " ${GREEN}System is fully optimized!${RESET}\n" +elif (( score >= total / 2 )); then + printf " ${YELLOW}Partially optimized — run 'make optimize' 
for improvements${RESET}\n" +else + printf " ${RED}Significant optimizations available — run 'make optimize'${RESET}\n" +fi +echo "" diff --git a/scripts/audit/system-report.sh b/scripts/audit/system-report.sh new file mode 100644 index 0000000..bf4cdb5 --- /dev/null +++ b/scripts/audit/system-report.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# Full system report — detailed audit with JSON + text output +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" +source "$SCRIPT_DIR/../../lib/detect.sh" +source "$SCRIPT_DIR/../../lib/format.sh" + +JSON_ONLY=false +[[ "${1:-}" == "--json" ]] && JSON_ONLY=true + +# ── Gather all data ────────────────────────────────────── +ts="$(timestamp)" +cpu_model="$(detect_cpu_model)" +cpu_threads="$(detect_cpu_cores)" +cpu_physical="$(detect_cpu_physical)" +gpu_name="$(detect_gpu_name)" +gpu_device_id="$(detect_gpu_device_id)" +kernel="$(detect_kernel_version)" +firmware="$(detect_firmware_version)" + +vram_total="$(detect_vram_total)" +vram_used="$(detect_vram_used)" +gtt_total="$(detect_gtt_total)" +gtt_used="$(detect_gtt_used)" +ram_kb="$(detect_system_ram_kb)" + +param_iommu="$(detect_kernel_param 'iommu')" +param_gttsize="$(detect_gttsize_param)" +param_pages="$(detect_pages_limit_param)" +cmdline="$(cat /proc/cmdline)" + +tuned="$(detect_tuned_profile)" +rocm_ver="$(detect_rocm_version)" +vulkan_drv="$(detect_vulkan_driver)" +vulkan_ver="$(detect_vulkan_version)" + +gpu_temp="$(detect_gpu_temp)" +gpu_power="$(detect_gpu_power)" +gpu_busy="$(detect_gpu_busy)" + +rec_gttsize="$(recommended_gttsize_mib)" +rec_pages="$(recommended_pages_limit)" + +# Toolbox list +toolboxes_json="[]" +if is_cmd toolbox; then + toolboxes_json="$(detect_toolbox_names | jq -R . | jq -s . 
2>/dev/null || echo '[]')" +fi + +# LLM stacks +stack_ollama="$(detect_stack_ollama)" +stack_lmstudio="$(detect_stack_lmstudio)" +stack_llamacpp="$(detect_stack_llamacpp)" +stack_opencode="$(detect_stack_opencode)" + +# ROCm packages +rocm_pkgs="$(detect_rocm_packages | head -30)" + +# ── Build JSON (all data via env vars — no shell interpolation into Python) ── +json_report="$( + SR_TS="$ts" \ + SR_CPU_MODEL="$cpu_model" SR_CPU_CORES="$cpu_physical" SR_CPU_THREADS="$cpu_threads" \ + SR_GPU_NAME="$gpu_name" SR_GPU_DEVICE_ID="$gpu_device_id" SR_RAM_KB="$ram_kb" \ + SR_VRAM_TOTAL="$vram_total" SR_VRAM_USED="$vram_used" \ + SR_GTT_TOTAL="$gtt_total" SR_GTT_USED="$gtt_used" \ + SR_REC_GTTSIZE="$rec_gttsize" SR_REC_PAGES="$rec_pages" \ + SR_KERNEL="$kernel" SR_CMDLINE="$cmdline" \ + SR_PARAM_IOMMU="$param_iommu" SR_PARAM_GTTSIZE="$param_gttsize" SR_PARAM_PAGES="$param_pages" \ + SR_FIRMWARE="$firmware" SR_TUNED="$tuned" SR_ROCM="$rocm_ver" \ + SR_VULKAN_DRV="$vulkan_drv" SR_VULKAN_VER="${vulkan_ver:-}" \ + SR_GPU_TEMP="$gpu_temp" SR_GPU_POWER="$gpu_power" SR_GPU_BUSY="$gpu_busy" \ + SR_TOOLBOXES="$toolboxes_json" \ + SR_STACK_OLLAMA="$stack_ollama" SR_STACK_LMSTUDIO="$stack_lmstudio" \ + SR_STACK_LLAMACPP="$stack_llamacpp" SR_STACK_OPENCODE="$stack_opencode" \ + python3 -c ' +import json, os +e = os.environ +data = { + "timestamp": e["SR_TS"], + "hardware": { + "cpu_model": e["SR_CPU_MODEL"], + "cpu_cores": int(e["SR_CPU_CORES"]), + "cpu_threads": int(e["SR_CPU_THREADS"]), + "gpu_name": e["SR_GPU_NAME"], + "gpu_device_id": e["SR_GPU_DEVICE_ID"], + "system_ram_kb": int(e["SR_RAM_KB"]), + }, + "memory": { + "vram_total_bytes": int(e["SR_VRAM_TOTAL"]), + "vram_used_bytes": int(e["SR_VRAM_USED"]), + "gtt_total_bytes": int(e["SR_GTT_TOTAL"]), + "gtt_used_bytes": int(e["SR_GTT_USED"]), + "recommended_gttsize_mib": int(e["SR_REC_GTTSIZE"]), + "recommended_pages_limit": int(e["SR_REC_PAGES"]), + }, + "kernel": { + "version": e["SR_KERNEL"], + "cmdline": e["SR_CMDLINE"], + 
"param_iommu": e["SR_PARAM_IOMMU"], + "param_gttsize": e["SR_PARAM_GTTSIZE"], + "param_pages_limit": e["SR_PARAM_PAGES"], + }, + "firmware": e["SR_FIRMWARE"], + "tuned_profile": e["SR_TUNED"], + "rocm_version": e["SR_ROCM"], + "vulkan": { + "driver": e["SR_VULKAN_DRV"], + "version": e["SR_VULKAN_VER"], + }, + "sensors": { + "gpu_temp_mc": int(e["SR_GPU_TEMP"]), + "gpu_power_uw": int(e["SR_GPU_POWER"]), + "gpu_busy_pct": int(e["SR_GPU_BUSY"]), + }, + "toolboxes": json.loads(e["SR_TOOLBOXES"]), + "stacks": { + "ollama": e["SR_STACK_OLLAMA"], + "lmstudio": e["SR_STACK_LMSTUDIO"], + "llamacpp": e["SR_STACK_LLAMACPP"], + "opencode": e["SR_STACK_OPENCODE"], + }, +} +print(json.dumps(data, indent=2)) +' +)" + +if $JSON_ONLY; then + echo "$json_report" | python3 -m json.tool 2>/dev/null || echo "$json_report" + exit 0 +fi + +# ── Save report ────────────────────────────────────────── +audit_dir="$(data_dir audits)" +json_file="$audit_dir/report-${ts}.json" +text_file="$audit_dir/report-${ts}.txt" + +echo "$json_report" | python3 -m json.tool > "$json_file" 2>/dev/null || echo "$json_report" > "$json_file" + +# ── Text output (also saved) ──────────────────────────── +{ + printf "Strix Halo Full System Report — %s\n" "$ts" + printf "=%.0s" {1..60}; echo + + printf "\nHardware:\n" + printf " CPU: %s (%sC/%sT)\n" "$cpu_model" "$cpu_physical" "$cpu_threads" + printf " GPU: %s (device: 0x%s)\n" "$gpu_name" "$gpu_device_id" + printf " RAM: %s KB\n" "$ram_kb" + + printf "\nMemory Allocation:\n" + printf " VRAM total: %s (used: %s)\n" "$(human_bytes "$vram_total")" "$(human_bytes "$vram_used")" + printf " GTT total: %s (used: %s)\n" "$(human_bytes "$gtt_total")" "$(human_bytes "$gtt_used")" + printf " Recommended: gttsize=%s MiB, pages_limit=%s\n" "$rec_gttsize" "$rec_pages" + + printf "\nKernel:\n" + printf " Version: %s\n" "$kernel" + printf " Firmware: %s\n" "$firmware" + printf " Cmdline: %s\n" "$cmdline" + printf " iommu: %s\n" "${param_iommu:-not set}" + printf " gttsize: 
%s\n" "${param_gttsize:-not set}" + printf " pages_limit:%s\n" "${param_pages:-not set}" + + printf "\nPerformance:\n" + printf " Tuned: %s\n" "$tuned" + printf " GPU temp: %s C\n" "$(echo "scale=1; $gpu_temp / 1000" | bc)" + printf " GPU power: %s W\n" "$(echo "scale=1; $gpu_power / 1000000" | bc)" + printf " GPU busy: %s%%\n" "$gpu_busy" + + printf "\nSoftware:\n" + printf " ROCm: %s\n" "$rocm_ver" + printf " Vulkan: %s %s\n" "$vulkan_drv" "$vulkan_ver" + + printf "\nROCm Packages:\n" + echo "$rocm_pkgs" | sed 's/^/ /' + + printf "\nToolboxes:\n" + if [[ "$toolboxes_json" == "[]" ]]; then + printf " none\n" + else + echo "$toolboxes_json" | python3 -c "import sys,json; [print(f' {x}') for x in json.load(sys.stdin)]" 2>/dev/null || printf " (parse error)\n" + fi + + printf "\nLLM Stacks:\n" + printf " ollama: %s\n" "$stack_ollama" + printf " LM Studio: %s\n" "$stack_lmstudio" + printf " llama.cpp: %s\n" "$stack_llamacpp" + printf " opencode: %s\n" "$stack_opencode" +} | tee "$text_file" + +echo "" +log_success "Report saved to:" +log_info " JSON: $json_file" +log_info " Text: $text_file" diff --git a/scripts/benchmark/compare.sh b/scripts/benchmark/compare.sh new file mode 100644 index 0000000..bf4a9a5 --- /dev/null +++ b/scripts/benchmark/compare.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash +# Compare two benchmark runs side-by-side +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" +source "$SCRIPT_DIR/../../lib/format.sh" + +if [[ $# -lt 2 ]]; then + echo "Usage: benchmark compare " + echo "" + echo "Examples:" + echo " bin/benchmark compare data/baselines/20260325-120000 data/benchmarks/post-opt-20260326-100000" + echo "" + echo "Available baselines:" + ls -d "$(data_dir baselines)"/*/ 2>/dev/null | sed 's|/$||' || echo " (none)" + echo "" + echo "Available benchmark runs:" + ls -d "$(data_dir benchmarks)"/*/ 2>/dev/null | sed 's|/$||' || echo " (none)" + exit 1 +fi + +BEFORE_DIR="$1" 
# Compare results
#
# print_comparison BEFORE_JSON AFTER_JSON
#   Prints one table row per (model, backend, test) found in either
#   summary.json, showing tokens/sec before, after, and the relative change.
#   Arguments: $1 - path to the "before" summary.json
#              $2 - path to the "after" summary.json
#   Outputs:   the table on stdout (delta column is ANSI-coloured)
#   Returns:   python3's exit status (non-zero if a file is missing/invalid)
print_comparison() {
  python3 - "$1" "$2" << 'PYEOF'
import sys, json

GREEN, RED, RESET = "\033[32m", "\033[31m", "\033[0m"
DELTA_WIDTH = 8


def load(path):
    with open(path) as f:
        return json.load(f)


def index_results(data):
    # Index by (model, backend, test)
    idx = {}
    for r in data.get("results", []):
        idx[(r["model"], r["backend"], r["test"])] = r["tokens_per_sec"]
    return idx


def value_cell(val):
    # Only a missing measurement prints a dash; a measured 0.0 is shown as 0.0.
    return "—" if val is None else f"{val:.1f}"


def delta_cell(b_val, a_val):
    # Pad the visible text first and colour it afterwards: if the escape
    # sequences went through the format spec they would count towards the
    # column width and the column would no longer line up.
    if b_val is None or a_val is None or b_val == 0:
        # No baseline (or a zero baseline) means no meaningful percentage.
        return "—".rjust(DELTA_WIDTH)
    pct = (a_val - b_val) / b_val * 100
    if pct > 0:
        return GREEN + f"+{pct:.1f}%".rjust(DELTA_WIDTH) + RESET
    if pct < 0:
        return RED + f"{pct:.1f}%".rjust(DELTA_WIDTH) + RESET
    return "0.0%".rjust(DELTA_WIDTH)


b_idx = index_results(load(sys.argv[1]))
a_idx = index_results(load(sys.argv[2]))

all_keys = sorted(set(b_idx) | set(a_idx))

if not all_keys:
    print("  No comparable results found.")
    sys.exit(0)

fmt = "  {:<18} {:<14} {:<7} {:>9} {:>9} {}"
print(fmt.format("Model", "Backend", "Test", "Before", "After",
                 "Delta".rjust(DELTA_WIDTH)))
print("  " + "-" * 70)

for key in all_keys:
    model, backend, test = key
    b_val = b_idx.get(key)
    a_val = a_idx.get(key)
    print(fmt.format(model[:18], backend[:14], test,
                     value_cell(b_val), value_cell(a_val),
                     delta_cell(b_val, a_val)))

print()
PYEOF
}

echo ""
print_comparison "$BEFORE_DIR/summary.json" "$AFTER_DIR/summary.json"
+bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null + +# ── 2. Discover available toolboxes and models ────────── +existing="$(detect_toolbox_names 2>/dev/null || true)" + +# Map toolbox names to llama-bench commands (same pattern as upstream) +declare -A BENCH_PATHS=( + [llama-vulkan-radv]="/usr/sbin/llama-bench" + [llama-vulkan-amdvlk]="/usr/sbin/llama-bench" + [llama-rocm-6.4.4]="/usr/local/bin/llama-bench" + [llama-rocm-7.2]="/usr/local/bin/llama-bench" + [llama-rocm7-nightlies]="/usr/local/bin/llama-bench" +) + +available_backends=() +for tb in "${!BENCH_PATHS[@]}"; do + if echo "$existing" | grep -q "^${tb}$"; then + available_backends+=("$tb") + log_success "Backend: $tb" + fi +done + +if (( ${#available_backends[@]} == 0 )); then + log_error "No toolbox backends found. Run: make benchmark-setup" + exit 1 +fi + +# Find models +mapfile -t MODEL_PATHS < <( + find "$MODEL_DIR" -type f -name '*.gguf' \ + \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \ + | sort +) + +if (( ${#MODEL_PATHS[@]} == 0 )); then + log_error "No GGUF models found in $MODEL_DIR. Run: make benchmark-setup" + exit 1 +fi + +log_info "Found ${#MODEL_PATHS[@]} model(s):" +for p in "${MODEL_PATHS[@]}"; do + printf " %s (%s)\n" "$(basename "$p")" "$(du -h "$p" | cut -f1)" +done + +# ── 3. Start metric logging ───────────────────────────── +METRICS_FILE="$RESULT_DIR/metrics.csv" +bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 & +METRICS_PID=$! +log_info "Metric logger started (PID: $METRICS_PID)" + +cleanup() { + kill "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +} +trap cleanup EXIT + +# ── 4. 
Run benchmarks ─────────────────────────────────── +for MODEL_PATH in "${MODEL_PATHS[@]}"; do + MODEL_NAME="$(basename "$MODEL_PATH" .gguf)" + + for BACKEND in "${available_backends[@]}"; do + BENCH_BIN="${BENCH_PATHS[$BACKEND]}" + BACKEND_SAFE="${BACKEND//[.-]/_}" + + # Build environment args for ROCm backends + ENV_ARGS=() + if [[ "$BACKEND" == *rocm* ]]; then + ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1) + fi + + # Standard test (pp512 + tg128, default context) + OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log" + if [[ ! -s "$OUT" ]]; then + printf "\n${BOLD}>> [%s] %s — standard test${RESET}\n" "$BACKEND" "$MODEL_NAME" + CMD=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" + -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 -r "$REPS_STANDARD") + + printf " cmd: %s\n" "${CMD[*]}" + if "${CMD[@]}" > "$OUT" 2>&1; then + log_success "Standard test complete" + tail -5 "$OUT" + else + log_error "Standard test failed (exit $?)" + echo "FAILED" >> "$OUT" + fi + else + log_info "Skipping standard test (log exists): $OUT" + fi + + # Long-context test (pp2048, tg32, ctx 32768) + OUT_LC="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1__longctx32768.log" + if [[ ! -s "$OUT_LC" ]]; then + printf "\n${BOLD}>> [%s] %s — long-context test${RESET}\n" "$BACKEND" "$MODEL_NAME" + + UB_SIZE=2048 + [[ "$BACKEND" == *vulkan* ]] && UB_SIZE=512 + + CMD_LC=(toolbox run -c "$BACKEND" -- "${ENV_ARGS[@]}" "$BENCH_BIN" + -ngl 99 -mmp 0 -m "$MODEL_PATH" -fa 1 + -p 2048 -n 32 -d 32768 -ub "$UB_SIZE" + -r "$REPS_LONGCTX") + + printf " cmd: %s\n" "${CMD_LC[*]}" + if "${CMD_LC[@]}" > "$OUT_LC" 2>&1; then + log_success "Long-context test complete" + tail -5 "$OUT_LC" + else + log_error "Long-context test failed (exit $?)" + echo "FAILED" >> "$OUT_LC" + fi + else + log_info "Skipping long-context test (log exists): $OUT_LC" + fi + done +done + +# ── 5. Parse results into summary JSON ────────────────── +log_info "Parsing results..." 
SUMMARY="$RESULT_DIR/summary.json"

# parse_bench_logs DIR
#   Reads every *.log in DIR (llama-bench output captured by the benchmark
#   loop), skips logs marked FAILED, and prints {"results": [...]} as JSON.
#   Arguments: $1 - directory containing the *.log files
#   Outputs:   JSON document on stdout
#
#   Columns are located by name from each table's header row instead of by
#   fixed position: the long-context run passes -ub, which adds a column to
#   the table, so fixed indices would read the wrong cells and drop those rows.
parse_bench_logs() {
  python3 - "$1" << 'PYEOF'
import sys, re, json
from pathlib import Path

WANTED = ("model", "size", "backend", "test", "t/s")
# Layout of a default run, used only when a log has no recognisable header:
# | model | size | params | backend | ngl | fa | mmap | test | t/s |
DEFAULT_LAYOUT = {"model": 0, "size": 1, "backend": 3, "test": 7, "t/s": 8}


def split_row(line):
    # "| a | b |" -> ["a", "b"]
    inner = line[1:-1] if line.endswith("|") else line[1:]
    return [cell.strip() for cell in inner.split("|")]


results = []

for logfile in sorted(Path(sys.argv[1]).glob("*.log")):
    content = logfile.read_text(encoding="utf-8", errors="replace")
    if "FAILED" in content:
        continue

    layout = DEFAULT_LAYOUT
    for line in content.splitlines():
        line = line.strip()
        # Only table rows are interesting; "---" marks the separator row.
        if not line.startswith("|") or "---" in line:
            continue

        row = split_row(line)
        lowered = [cell.lower() for cell in row]
        if "model" in lowered and "t/s" in lowered:
            # Header row: remember where each column lives in this table.
            layout = {name: lowered.index(name) for name in WANTED if name in lowered}
            continue

        try:
            test_type = row[layout["test"]]
            ts_raw = row[layout["t/s"]]
            if not test_type or not ts_raw:
                continue

            # Parse "548.18 +/- 1.59" or just "548.18"
            ts_match = re.match(r"([\d.]+)", ts_raw)
            if not ts_match:
                continue

            results.append({
                "file": logfile.name,
                "model": row[layout["model"]],
                "size": row[layout["size"]],
                "backend": row[layout["backend"]],
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "raw": ts_raw,
            })
        except (KeyError, IndexError, ValueError):
            continue

print(json.dumps({"results": results}, indent=2))
PYEOF
}

parse_bench_logs "$RESULT_DIR" > "$SUMMARY"
Check log files for errors.") + sys.exit(0) + +# Print table +fmt = " {:<20} {:<16} {:<8} {:>10}" +print(fmt.format("Model", "Backend", "Test", "t/s")) +print(" " + "-" * 58) +for r in data["results"]: + print(fmt.format( + r["model"][:20], + r["backend"][:16], + r["test"], + f"{r['tokens_per_sec']:.2f}" + )) +PYEOF + +echo "" +log_success "Baseline saved to: $RESULT_DIR" +log_info "Files: system-state.json, summary.json, metrics.csv, *.log" +log_info "Compare later with: bin/benchmark compare $RESULT_DIR " diff --git a/scripts/benchmark/run-suite.sh b/scripts/benchmark/run-suite.sh new file mode 100644 index 0000000..e996cb8 --- /dev/null +++ b/scripts/benchmark/run-suite.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# Full benchmark suite — run all backends × models with tagging +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" +source "$SCRIPT_DIR/../../lib/detect.sh" +source "$SCRIPT_DIR/../../lib/format.sh" + +MODEL_DIR="$(data_dir models)" +TAG="run" +BACKENDS_FILTER="" +MODELS_FILTER="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --tag|-t) TAG="$2"; shift 2 ;; + --backends|-b) BACKENDS_FILTER="$2"; shift 2 ;; + --models|-m) MODELS_FILTER="$2"; shift 2 ;; + *) shift ;; + esac +done + +TS="$(timestamp)" +RESULT_DIR="$(data_dir benchmarks)/${TAG}-${TS}" +mkdir -p "$RESULT_DIR" + +REPS_STANDARD=5 +REPS_LONGCTX=3 + +log_header "Benchmark Suite: $TAG" +log_info "Results: $RESULT_DIR" + +# Save system state +bash "$SCRIPT_DIR/../audit/system-report.sh" --json > "$RESULT_DIR/system-state.json" 2>/dev/null + +# Discover backends +existing="$(detect_toolbox_names 2>/dev/null || true)" + +declare -A BENCH_PATHS=( + [llama-vulkan-radv]="/usr/sbin/llama-bench" + [llama-vulkan-amdvlk]="/usr/sbin/llama-bench" + [llama-rocm-6.4.4]="/usr/local/bin/llama-bench" + [llama-rocm-7.2]="/usr/local/bin/llama-bench" + [llama-rocm7-nightlies]="/usr/local/bin/llama-bench" +) + +available_backends=() +for tb in 
"${!BENCH_PATHS[@]}"; do + if echo "$existing" | grep -q "^${tb}$"; then + if [[ -z "$BACKENDS_FILTER" ]] || echo "$BACKENDS_FILTER" | tr ',' '\n' | grep -q "$tb"; then + available_backends+=("$tb") + fi + fi +done + +if (( ${#available_backends[@]} == 0 )); then + log_error "No matching backends. Run: make benchmark-setup" + exit 1 +fi +log_info "Backends: ${available_backends[*]}" + +# Find models +mapfile -t MODEL_PATHS < <( + find "$MODEL_DIR" -type f -name '*.gguf' \ + \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \ + | sort +) + +if [[ -n "$MODELS_FILTER" ]]; then + filtered=() + for p in "${MODEL_PATHS[@]}"; do + name="$(basename "$p")" + if echo "$MODELS_FILTER" | tr ',' '\n' | grep -qi "$name"; then + filtered+=("$p") + fi + done + MODEL_PATHS=("${filtered[@]}") +fi + +if (( ${#MODEL_PATHS[@]} == 0 )); then + log_error "No models found. Run: make benchmark-setup" + exit 1 +fi +log_info "Models: ${#MODEL_PATHS[@]}" + +# Start metric logging +METRICS_FILE="$RESULT_DIR/metrics.csv" +bash "$SCRIPT_DIR/../monitor/log-metrics.sh" --output "$METRICS_FILE" --interval 2 & +METRICS_PID=$! +trap 'kill "$METRICS_PID" 2>/dev/null; wait "$METRICS_PID" 2>/dev/null' EXIT + +# Run benchmarks (same logic as run-baseline.sh) +for MODEL_PATH in "${MODEL_PATHS[@]}"; do + MODEL_NAME="$(basename "$MODEL_PATH" .gguf)" + + for BACKEND in "${available_backends[@]}"; do + BENCH_BIN="${BENCH_PATHS[$BACKEND]}" + BACKEND_SAFE="${BACKEND//[.-]/_}" + + ENV_ARGS=() + [[ "$BACKEND" == *rocm* ]] && ENV_ARGS=(env ROCBLAS_USE_HIPBLASLT=1) + + # Standard test + OUT="$RESULT_DIR/${MODEL_NAME}__${BACKEND_SAFE}__fa1.log" + if [[ ! 
# Parse results
SUMMARY="$RESULT_DIR/summary.json"

# parse_bench_logs DIR
#   Reads every *.log in DIR (llama-bench output captured by the benchmark
#   loop), skips logs marked FAILED, and prints {"results": [...]} as JSON.
#   Arguments: $1 - directory containing the *.log files
#   Outputs:   JSON document on stdout
#
#   Columns are located by name from each table's header row instead of by
#   fixed position: the long-context run passes -ub, which adds a column to
#   the table, so fixed indices would read the wrong cells and drop those rows.
parse_bench_logs() {
  python3 - "$1" << 'PYEOF'
import sys, re, json
from pathlib import Path

WANTED = ("model", "size", "backend", "test", "t/s")
# Layout of a default run, used only when a log has no recognisable header:
# | model | size | params | backend | ngl | fa | mmap | test | t/s |
DEFAULT_LAYOUT = {"model": 0, "size": 1, "backend": 3, "test": 7, "t/s": 8}


def split_row(line):
    # "| a | b |" -> ["a", "b"]
    inner = line[1:-1] if line.endswith("|") else line[1:]
    return [cell.strip() for cell in inner.split("|")]


results = []

for logfile in sorted(Path(sys.argv[1]).glob("*.log")):
    content = logfile.read_text(encoding="utf-8", errors="replace")
    if "FAILED" in content:
        continue

    layout = DEFAULT_LAYOUT
    for line in content.splitlines():
        line = line.strip()
        # Only table rows are interesting; "---" marks the separator row.
        if not line.startswith("|") or "---" in line:
            continue

        row = split_row(line)
        lowered = [cell.lower() for cell in row]
        if "model" in lowered and "t/s" in lowered:
            # Header row: remember where each column lives in this table.
            layout = {name: lowered.index(name) for name in WANTED if name in lowered}
            continue

        try:
            test_type = row[layout["test"]]
            ts_raw = row[layout["t/s"]]
            if not test_type or not ts_raw:
                continue

            ts_match = re.match(r"([\d.]+)", ts_raw)
            if not ts_match:
                continue

            results.append({
                "file": logfile.name,
                "model": row[layout["model"]],
                "size": row[layout["size"]],
                "backend": row[layout["backend"]],
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "raw": ts_raw,
            })
        except (KeyError, IndexError, ValueError):
            continue

print(json.dumps({"results": results}, indent=2))
PYEOF
}

parse_bench_logs "$RESULT_DIR" > "$SUMMARY"
# ── 2. Verify GPU access inside toolboxes ────────────────
log_info "Verifying GPU access in toolboxes..."

# Re-read the toolbox list: step 1 may have just created the missing ones,
# and the list captured before that step would skip them here.
existing="$(detect_toolbox_names 2>/dev/null || true)"

for tb in "${REQUIRED_TOOLBOXES[@]}"; do
  # Whole-line, literal match — same rule as the presence check in step 1.
  if ! grep -qxF -- "$tb" <<< "$existing"; then
    continue
  fi

  # Capture the output first. Piping straight into `grep -q` under
  # `set -o pipefail` can report failure when grep exits on its first match
  # while the producer is still writing.
  devices="$(toolbox run -c "$tb" -- llama-cli --list-devices 2>&1 || true)"
  if grep -qiE 'gpu|vulkan|rocm' <<< "$devices"; then
    log_success "GPU accessible in $tb"
  else
    log_warn "GPU may not be accessible in $tb — check device mappings"
  fi
done
Example:" + echo "" + echo " # Small (4B, ~3 GB):" + echo " huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \\" + echo " --local-dir $MODEL_DIR" + echo "" + echo " # Medium (14B, ~9 GB):" + echo " huggingface-cli download Qwen/Qwen3-14B-GGUF Qwen3-14B-Q4_K_M.gguf \\" + echo " --local-dir $MODEL_DIR" + echo "" + + if is_cmd huggingface-cli; then + if confirm "Download Qwen3-4B Q4_K_M (~3 GB) as test model?"; then + huggingface-cli download Qwen/Qwen3-4B-GGUF Qwen3-4B-Q4_K_M.gguf \ + --local-dir "$MODEL_DIR" + log_success "Model downloaded" + fi + else + log_info "Install huggingface-cli: pip install huggingface_hub[cli]" + fi +fi + +log_header "Setup Complete" +log_info "Run 'make benchmark-baseline' to capture your baseline." diff --git a/scripts/monitor/dashboard.sh b/scripts/monitor/dashboard.sh new file mode 100644 index 0000000..06c1c96 --- /dev/null +++ b/scripts/monitor/dashboard.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# Tmux-based monitoring dashboard +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../../lib/common.sh" + +SESSION="strix-monitor" +SIMPLE=false +WITH_LOG=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --simple|-s) SIMPLE=true; shift ;; + --with-logging|-l) WITH_LOG=true; shift ;; + *) shift ;; + esac +done + +# Simple mode: just launch amdgpu_top +if $SIMPLE; then + if is_cmd amdgpu_top; then + exec amdgpu_top + elif is_cmd nvtop; then + log_warn "amdgpu_top not found, falling back to nvtop" + exec nvtop + else + log_error "No GPU monitor installed. Run: make monitor-install" + exit 1 + fi +fi + +# Full dashboard requires tmux +if ! is_cmd tmux; then + log_error "tmux is required for dashboard mode. Run: make monitor-install" + exit 1 +fi + +# Pick GPU monitor +GPU_MON="nvtop" +if is_cmd amdgpu_top; then + GPU_MON="amdgpu_top" +fi + +# Pick system monitor +SYS_MON="htop" +if is_cmd btop; then + SYS_MON="btop" +elif ! 
is_cmd htop; then + SYS_MON="top" +fi + +# Kill existing session if running +tmux kill-session -t "$SESSION" 2>/dev/null || true + +# Start background logging if requested +LOG_CMD="echo Metric logging not active. Use --with-logging to enable.; read -r" +LOG_PID="" +if $WITH_LOG; then + LOG_FILE="$(data_dir logs)/metrics-$(timestamp).csv" + bash "$SCRIPT_DIR/log-metrics.sh" --output "$LOG_FILE" & + LOG_PID=$! + LOG_CMD="tail -f \"$LOG_FILE\"" + log_info "Metric logger started (PID: $LOG_PID) → $LOG_FILE" +fi + +# Cleanup logger on exit +cleanup() { + if [[ -n "$LOG_PID" ]]; then + kill "$LOG_PID" 2>/dev/null || true + wait "$LOG_PID" 2>/dev/null || true + fi +} +trap cleanup EXIT + +# Create tmux layout +# +--------------------+--------------------+ +# | GPU monitor | System monitor | +# | | | +# +--------------------------------------------+ +# | Metrics log tail / status | +# +--------------------------------------------+ +tmux new-session -d -s "$SESSION" -x "$(tput cols 2>/dev/null || echo 120)" -y "$(tput lines 2>/dev/null || echo 40)" "$GPU_MON" +tmux split-window -t "$SESSION" -h "$SYS_MON" +tmux split-window -t "$SESSION" -v -p 20 "$LOG_CMD" +tmux select-pane -t "$SESSION:0.0" + +log_info "Dashboard started. Attach with: tmux attach -t $SESSION" +log_info "Detach with Ctrl+B then D. 
# ── amdgpu_top (most important) ─────────────────────────
# Tries, in order: the pre-built RPM from GitHub releases, the dnf repos,
# and a source build via cargo. Prints manual instructions if all fail.
if is_cmd amdgpu_top; then
  log_success "amdgpu_top already installed: $(amdgpu_top --version 2>&1 | head -1)"
else
  log_info "Installing amdgpu_top (best AMD GPU monitor)..."
  installed=false

  AMDGPU_TOP_VERSION="0.11.2"
  RPM_URL="https://github.com/Umio-Yasuno/amdgpu_top/releases/download/v${AMDGPU_TOP_VERSION}/amdgpu_top-${AMDGPU_TOP_VERSION}-1.x86_64.rpm"

  # Method 1: Install RPM from GitHub releases (fastest, works on Fedora)
  log_info "Downloading pre-built RPM from GitHub releases..."
  # Download into a private mktemp directory: the file is handed to
  # `sudo dnf` next, so it must not live under a predictable /tmp name that
  # another local user could pre-create or replace.
  if rpm_dir="$(mktemp -d)"; then
    RPM_FILE="$rpm_dir/amdgpu_top-${AMDGPU_TOP_VERSION}.rpm"
    if curl -fsSL -o "$RPM_FILE" "$RPM_URL" 2>/dev/null; then
      if sudo dnf install -y "$RPM_FILE" 2>&1; then
        installed=true
        log_success "amdgpu_top installed from RPM"
      else
        log_warn "RPM install failed"
      fi
    else
      log_warn "RPM download failed"
    fi
    # Remove the download whether or not the install worked.
    rm -rf -- "${rpm_dir:?}"
  else
    log_warn "RPM download failed"
  fi

  # Method 2: Try dnf repos
  if ! $installed; then
    log_info "Trying dnf repos..."
    if sudo dnf install -y amdgpu_top 2>/dev/null; then
      installed=true
      log_success "amdgpu_top installed via dnf"
    fi
  fi

  # Method 3: cargo (if available)
  if ! $installed && is_cmd cargo; then
    log_info "Building from source via cargo..."
    if cargo install amdgpu_top 2>&1; then
      installed=true
      log_success "amdgpu_top installed via cargo"
    else
      log_warn "cargo install failed"
    fi
  fi

  if ! $installed; then
    log_warn "Could not install amdgpu_top automatically."
    log_info "Manual options:"
    log_info "  1. Download RPM: curl -LO $RPM_URL && sudo dnf install ./amdgpu_top-*.rpm"
    log_info "  2. Download AppImage: https://github.com/Umio-Yasuno/amdgpu_top/releases/latest"
  fi
fi
# Validate numeric args
# INTERVAL is passed to `sleep` between samples and must be at least 1.
# DURATION may be 0, which means "run until killed".
[[ "$INTERVAL" =~ ^[0-9]+$ ]] || { log_error "--interval must be a positive integer"; exit 1; }
[[ "$DURATION" =~ ^[0-9]+$ ]] || { log_error "--duration must be a non-negative integer (0 = indefinite)"; exit 1; }
# Force base 10 so values such as "08" are not parsed as invalid octal by the
# (( ... )) comparisons further down.
INTERVAL=$(( 10#$INTERVAL ))
DURATION=$(( 10#$DURATION ))
(( INTERVAL > 0 )) || { log_error "--interval must be a positive integer"; exit 1; }

if [[ -z "$OUTPUT" ]]; then
  OUTPUT="$(data_dir logs)/metrics-$(timestamp).csv"
fi

mkdir -p "$(dirname "$OUTPUT")"

# Cache sysfs paths once (avoid re-globbing every iteration)
SYSFS_GPU_BUSY="$GPU_SYSFS/gpu_busy_percent"
SYSFS_VRAM_USED="$GPU_SYSFS/mem_info_vram_used"
SYSFS_GTT_USED="$GPU_SYSFS/mem_info_gtt_used"
# Default to /dev/null rather than an empty string: the sampling loop does
# `read ... < "$SYSFS_TEMP"`, and redirecting from "" prints a shell error on
# every sample, whereas reading /dev/null just fails quietly and the loop's
# "|| var=0" fallback applies.
SYSFS_TEMP="/dev/null"
SYSFS_POWER="/dev/null"
for f in "$GPU_SYSFS"/hwmon/hwmon*/temp1_input; do
  if [[ -f "$f" ]]; then
    SYSFS_TEMP="$f"
    break
  fi
done
for f in "$GPU_SYSFS"/hwmon/hwmon*/power1_average; do
  if [[ -f "$f" ]]; then
    SYSFS_POWER="$f"
    break
  fi
done
# Read the aggregate "cpu" line (first line) of /proc/stat.
# Globals:
#   CPU_TOTAL (written) - sum of user, nice, system, idle, iowait, irq,
#                         softirq and steal (steal counts as 0 when absent)
#   CPU_IDLE  (written) - the idle field
# Returns: non-zero if /proc/stat cannot be read.
read_cpu_stat() {
  local user nice system idle iowait irq softirq steal _rest
  # Let `read` split the fields with its own IFS instead of `set -- $line`,
  # which depended on the caller's IFS and was subject to glob expansion.
  # Line format: "cpu user nice system idle iowait irq softirq steal ..."
  IFS=' ' read -r _ user nice system idle iowait irq softirq steal _rest < /proc/stat
  CPU_TOTAL=$(( user + nice + system + idle + iowait + irq + softirq + ${steal:-0} ))
  CPU_IDLE=$idle
}
#!/usr/bin/env bash
# scripts/optimize/kernel-params.sh
# Configure kernel boot parameters for unified memory optimization.
#
# Steps:
#   1. Report the state of iommu, amdgpu.gttsize and ttm.pages_limit.
#   2. After confirmation, back up /etc/default/grub.
#   3. Rewrite GRUB_CMDLINE_LINUX with the recommended values.
#   4. Regenerate the boot configuration (grubby, else grub2-mkconfig,
#      else grub-mkconfig).
#
# Must be run as root. Uses python3 to read and edit the GRUB defaults file.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

GRUB_FILE="/etc/default/grub"

log_header "Kernel Boot Parameter Optimization"

# ── Check root early ────────────────────────────────────
if [[ $EUID -ne 0 ]]; then
    log_error "This script requires root. Re-run with: sudo make optimize-kernel"
    exit 1
fi

# ── Show current state ───────────────────────────────────
log_info "Current kernel command line:"
printf "  ${DIM}%s${RESET}\n" "$(</proc/cmdline)"
echo ""

param_iommu="$(detect_kernel_param 'iommu')"
param_gttsize="$(detect_gttsize_param)"
param_pages="$(detect_pages_limit_param)"

rec_gttsize="$(recommended_gttsize_mib)"
rec_pages="$(recommended_pages_limit)"
rec_gtt_human="$(human_mib "$rec_gttsize")"

# ── Check what's needed ──────────────────────────────────
needs_change=false

log_info "Parameter status:"

if [[ "$param_iommu" == "pt" ]]; then
    print_status pass "iommu=pt" "already set"
else
    if [[ -n "$param_iommu" ]]; then
        iommu_detail="current: $param_iommu"
    else
        iommu_detail="missing"
    fi
    print_status fail "iommu=pt" "$iommu_detail"
    needs_change=true
fi

if [[ -n "$param_gttsize" ]] && (( param_gttsize >= rec_gttsize )); then
    print_status pass "amdgpu.gttsize" "current: $param_gttsize MiB"
else
    if [[ -n "$param_gttsize" ]]; then
        gtt_detail="current: $param_gttsize MiB, "
    else
        gtt_detail="missing, "
    fi
    print_status fail "amdgpu.gttsize" "${gtt_detail}recommended: $rec_gttsize MiB (~$rec_gtt_human)"
    needs_change=true
fi

if [[ -n "$param_pages" ]] && (( param_pages >= rec_pages )); then
    print_status pass "ttm.pages_limit" "current: $param_pages"
else
    if [[ -n "$param_pages" ]]; then
        pages_detail="current: $param_pages, "
    else
        pages_detail="missing, "
    fi
    print_status fail "ttm.pages_limit" "${pages_detail}recommended: $rec_pages"
    needs_change=true
fi

if [[ "$needs_change" == false ]]; then
    echo ""
    log_success "All kernel parameters are already optimal!"
    exit 0
fi

# ── Explain what we're doing ─────────────────────────────
echo ""
log_info "These parameters enable unified memory for the integrated GPU:"
printf '  %-28s %s\n' \
    "iommu=pt" "IOMMU passthrough — reduces memory access latency" \
    "amdgpu.gttsize=$rec_gttsize" "GPU can dynamically access ~$rec_gtt_human system RAM" \
    "ttm.pages_limit=$rec_pages" "Pin limit for GPU memory pages ($rec_gtt_human in 4K pages)"
echo ""

# ── Apply changes ────────────────────────────────────────
if ! confirm "Apply these kernel parameters to GRUB?"; then
    log_info "Skipped. You can apply manually by editing $GRUB_FILE"
    exit 0
fi

# Backup
BACKUP_DIR="$(data_dir backups)"
backup_file="$BACKUP_DIR/grub-$(timestamp).bak"
cp -- "$GRUB_FILE" "$backup_file"
log_success "GRUB backup saved: $backup_file"

# Read the current GRUB_CMDLINE_LINUX value. Data is passed via environment
# variables and the program via a quoted heredoc, so nothing is interpolated
# into the Python source. Double-quoted, single-quoted and unquoted
# assignments are all recognised: an unrecognised style would read as empty
# and the existing parameters would be lost when the line is rewritten.
current_cmdline="$(
    GRUB_PATH="$GRUB_FILE" python3 - <<'PY'
import os
import re

PATTERNS = (
    r'^GRUB_CMDLINE_LINUX="(.*)"',
    r"^GRUB_CMDLINE_LINUX='(.*)'",
    r"^GRUB_CMDLINE_LINUX=([^\s\"']*)\s*$",
)

with open(os.environ["GRUB_PATH"]) as f:
    for line in f:
        for pattern in PATTERNS:
            m = re.match(pattern, line)
            if m:
                print(m.group(1))
                raise SystemExit(0)
PY
)"

# Drop any existing values of the parameters we manage, then squeeze the
# whitespace left behind. Matching on whole whitespace-delimited tokens avoids
# clipping unrelated options that merely end in "iommu=". sed is used for the
# cleanup (not xargs) because xargs would strip quotes/backslashes and abort on
# unbalanced quotes inside the command line.
new_cmdline="$(
    printf '%s\n' "$current_cmdline" | sed -E \
        -e 's/(^|[[:space:]])(iommu|amd_iommu|amdgpu\.gttsize|ttm\.pages_limit)=[^[:space:]]*//g' \
        -e 's/[[:space:]]+/ /g' \
        -e 's/^ //' \
        -e 's/ $//'
)"

# Add new params (no leading space when the original value was empty)
managed_params="iommu=pt amdgpu.gttsize=$rec_gttsize ttm.pages_limit=$rec_pages"
new_cmdline="${new_cmdline:+$new_cmdline }$managed_params"

log_info "GRUB_CMDLINE_LINUX change:"
printf "  ${RED}Before:${RESET} %s\n" "$current_cmdline"
printf "  ${GREEN}After:${RESET}  %s\n" "$new_cmdline"
echo ""

if ! confirm "Write this change?"; then
    log_info "Aborted. Backup remains at: $backup_file"
    exit 0
fi

# Rewrite the assignment in place. The new value is spliced in by position
# rather than used as a re.sub() template, so backslashes in the command line
# are written literally. The existing quote style is kept; if the file has no
# GRUB_CMDLINE_LINUX line at all, one is appended instead of silently doing
# nothing.
GRUB_PATH="$GRUB_FILE" NEW_CMDLINE="$new_cmdline" python3 - <<'PY'
import os
import re

grub_path = os.environ["GRUB_PATH"]
new_cmdline = os.environ["NEW_CMDLINE"]

with open(grub_path) as f:
    content = f.read()

m = re.search(r"^GRUB_CMDLINE_LINUX=(.*)$", content, flags=re.MULTILINE)
if m:
    old_value = m.group(1)
    quote = "'" if old_value.startswith("'") else '"'
    new_line = "GRUB_CMDLINE_LINUX=" + quote + new_cmdline + quote
    content = content[:m.start()] + new_line + content[m.end():]
else:
    if content and not content.endswith("\n"):
        content += "\n"
    content += 'GRUB_CMDLINE_LINUX="' + new_cmdline + '"\n'

with open(grub_path, "w") as f:
    f.write(content)
PY
log_success "GRUB config updated"

# Regenerate GRUB — prefer grubby on modern Fedora (BLS), fall back to grub2-mkconfig
log_info "Regenerating boot configuration..."
if is_cmd grubby; then
    grubby --update-kernel=ALL --args="$managed_params"
    log_success "Boot entries updated via grubby"
elif [[ -d /boot/grub2 ]]; then
    grub2-mkconfig -o /boot/grub2/grub.cfg
    log_success "GRUB regenerated via grub2-mkconfig"
elif [[ -d /boot/grub ]]; then
    grub-mkconfig -o /boot/grub/grub.cfg
    log_success "GRUB regenerated via grub-mkconfig"
else
    log_error "Could not find grubby or grub config directory. Regenerate manually."
    exit 1
fi

echo ""
log_warn "REBOOT REQUIRED for kernel parameters to take effect."
log_info "After reboot, verify with: make audit"
#!/usr/bin/env bash
# scripts/optimize/rollback.sh
# Rollback optimization changes.
#
# Offers to restore the newest GRUB backup found in the backups data
# directory and to switch tuned back to the previously saved profile, then
# prints a reminder that BIOS VRAM settings must be reverted by hand.
# Exits non-zero if a rollback step was attempted but could not be completed.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"

GRUB_FILE="/etc/default/grub"
BACKUP_DIR="$(data_dir backups)"

# Becomes 1 when a step fails; the remaining steps still run.
rollback_status=0

log_header "Rollback Optimizations"

# ── 1. GRUB rollback ────────────────────────────────────
log_info "GRUB backups:"
# Reverse sort puts the newest backup first (names embed a timestamp).
mapfile -t grub_backups < <(find "$BACKUP_DIR" -name 'grub-*.bak' -print 2>/dev/null | sort -r)

if (( ${#grub_backups[@]} == 0 )); then
    log_info "  No GRUB backups found"
else
    for i in "${!grub_backups[@]}"; do
        printf "  [%d] %s\n" "$i" "${grub_backups[$i]}"
    done
    echo ""

    if confirm "Restore most recent GRUB backup?"; then
        require_root
        backup="${grub_backups[0]}"
        cp -- "$backup" "$GRUB_FILE"
        log_success "GRUB restored from: $backup"

        log_info "Regenerating boot configuration..."
        regenerated=true
        if is_cmd grubby; then
            # On BLS systems, also need to remove args via grubby.
            # A failure is reported instead of being hidden, because the boot
            # entries would otherwise keep the old parameters unnoticed.
            if grubby --update-kernel=ALL --remove-args="iommu amdgpu.gttsize ttm.pages_limit"; then
                log_success "Boot entries updated via grubby"
            else
                log_error "grubby could not update the boot entries. Remove the arguments manually."
                regenerated=false
            fi
        elif [[ -d /boot/grub2 ]]; then
            grub2-mkconfig -o /boot/grub2/grub.cfg
            log_success "GRUB regenerated via grub2-mkconfig"
        elif [[ -d /boot/grub ]]; then
            grub-mkconfig -o /boot/grub/grub.cfg
            log_success "GRUB regenerated via grub-mkconfig"
        else
            log_error "Could not find grubby or grub config directory. Regenerate manually."
            regenerated=false
        fi

        if [[ "$regenerated" == true ]]; then
            log_warn "Reboot required for changes to take effect."
        else
            rollback_status=1
        fi
    fi
fi

# ── 2. Tuned profile rollback ───────────────────────────
prev_profile_file="$BACKUP_DIR/tuned-previous-profile.txt"
if [[ -f "$prev_profile_file" ]]; then
    prev_profile="$(<"$prev_profile_file")"

    if [[ -z "${prev_profile//[[:space:]]/}" ]]; then
        # An empty file would otherwise lead to `tuned-adm profile ""`.
        log_warn "Saved tuned profile is empty: $prev_profile_file"
    elif ! is_cmd tuned-adm; then
        log_warn "tuned-adm not found; cannot restore tuned profile: $prev_profile"
    else
        current="$(tuned-adm active 2>/dev/null | sed 's/Current active profile: //' || echo "unknown")"
        log_info "Tuned profile: $current (previous: $prev_profile)"

        if [[ "$current" != "$prev_profile" ]] && confirm "Restore tuned profile to $prev_profile?"; then
            sudo tuned-adm profile "$prev_profile"
            log_success "Tuned profile restored to: $prev_profile"
        fi
    fi
else
    log_info "No previous tuned profile saved"
fi

# ── 3. BIOS reminder ────────────────────────────────────
echo ""
log_warn "BIOS VRAM changes cannot be rolled back automatically."
log_info "To revert: Reboot → F10 → Advanced → UMA Frame Buffer Size → restore previous value"

exit "$rollback_status"
#!/usr/bin/env bash
# scripts/optimize/tuned-profile.sh
# Switch tuned profile to accelerator-performance.
#
# Saves the currently active profile to the backups data directory (used by
# the rollback script) before switching. Exits non-zero if tuned is missing,
# the profile is unavailable, or the switch cannot be confirmed afterwards.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"

RECOMMENDED="accelerator-performance"

log_header "Tuned Profile Optimization"

if ! is_cmd tuned-adm; then
    log_error "tuned is not installed. Install with: sudo dnf install tuned"
    exit 1
fi

current="$(detect_tuned_profile)"
log_info "Current profile: $current"

if [[ "$current" == "$RECOMMENDED" ]]; then
    log_success "Already using $RECOMMENDED"
    exit 0
fi

# Check availability.
# The list is captured once and searched with awk, which reads all of its
# input. Piping straight into `grep -q` under pipefail can report a failure
# when grep exits early and the writer receives SIGPIPE. Comparing whole
# fields also avoids accepting a profile whose name merely contains
# $RECOMMENDED.
available_profiles="$(tuned-adm list 2>/dev/null || true)"
if ! awk -v want="$RECOMMENDED" '
        { for (i = 1; i <= NF; i++) if ($i == want) found = 1 }
        END { exit !found }
    ' <<<"$available_profiles"; then
    log_error "$RECOMMENDED profile not available"
    log_info "Available profiles:"
    # sed -n exits 0 even when nothing matches, so the exit below is reached.
    sed -n 's/^-/  -/p' <<<"$available_profiles"
    exit 1
fi

echo ""
log_info "Recommended: $RECOMMENDED"
log_info "Description: Throughput performance with disabled higher latency STOP states"
log_info "Benefit: 5-8% improvement in prompt processing (pp) benchmarks"
log_info "No reboot required."
echo ""

if ! confirm "Switch to $RECOMMENDED?"; then
    log_info "Skipped"
    exit 0
fi

# Save current for rollback. An empty value is not written, since it would
# give the rollback script nothing valid to restore.
if [[ -n "$current" ]]; then
    backup_dir="$(data_dir backups)"
    printf '%s\n' "$current" > "$backup_dir/tuned-previous-profile.txt"
fi

sudo tuned-adm profile "$RECOMMENDED"

new_profile="$(detect_tuned_profile)"
if [[ "$new_profile" == "$RECOMMENDED" ]]; then
    log_success "Profile switched to: $new_profile"
else
    log_error "Profile switch may have failed. Current: $new_profile"
    exit 1
fi
#!/usr/bin/env bash
# scripts/optimize/verify.sh
# Post-optimization verification checklist.
#
# Runs a series of pass/fail checks (kernel version, firmware, kernel boot
# parameters, tuned profile, VRAM/GTT sizes, monitoring tool) and prints a
# score with a short recommendation.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

readonly ONE_GIB=1073741824

log_header "Optimization Verification"

score=0
total=0

# Record one checklist item and print it.
# Globals:   total (incremented), score (incremented on pass)
# Arguments: $1 - "1" for pass, anything else for fail
#            $2 - label
#            $3 - detail text
check() {
    local pass="$1" label="$2" detail="$3"
    total=$(( total + 1 ))
    if [[ "$pass" == "1" ]]; then
        score=$(( score + 1 ))
        print_status pass "$label" "$detail"
    else
        print_status fail "$label" "$detail"
    fi
}

# Succeed when a dotted version string is >= the given major.minor.patch.
# Only the leading numeric "X.Y[.Z]" part is compared, so suffixes such as
# "-200.fc43.x86_64" or "-rc1" are ignored. A missing patch level counts as 0.
# Arguments: $1 - version string, $2 - major, $3 - minor, $4 - patch
# Returns:   0 if version >= required, 1 otherwise (or if unparsable)
version_at_least() {
    local version="$1" want_major="$2" want_minor="$3" want_patch="$4"
    local major minor patch
    [[ "$version" =~ ^([0-9]+)\.([0-9]+)(\.([0-9]+))? ]] || return 1
    major=$(( 10#${BASH_REMATCH[1]} ))
    minor=$(( 10#${BASH_REMATCH[2]} ))
    patch=$(( 10#${BASH_REMATCH[4]:-0} ))
    if (( major != want_major )); then
        (( major > want_major ))
    elif (( minor != want_minor )); then
        (( minor > want_minor ))
    else
        (( patch >= want_patch ))
    fi
}

# Print a byte count as GiB with one decimal, truncated (e.g. 0.5, 96.0).
# Uses integer arithmetic so small values keep their leading zero.
# Arguments: $1 - non-negative integer byte count
format_gib() {
    local tenths=$(( $1 * 10 / ONE_GIB ))
    printf '%d.%d' "$(( tenths / 10 ))" "$(( tenths % 10 ))"
}

# Kernel version — the patch level is part of the requirement (6.18.4)
kernel="$(detect_kernel_version)"
kernel_ok=0
if version_at_least "$kernel" 6 18 4; then
    kernel_ok=1
fi
check "$kernel_ok" "Kernel >= 6.18.4" "$kernel"

# Firmware
fw_ok=1
if detect_firmware_bad; then
    fw_ok=0
fi
check "$fw_ok" "Firmware (not 20251125)" "$(detect_firmware_version)"

# Kernel params
iommu_val="$(detect_kernel_param 'iommu')"
iommu_ok=0
if [[ "$iommu_val" == "pt" ]]; then
    iommu_ok=1
fi
check "$iommu_ok" "iommu=pt" "${iommu_val:-not set}"

gttsize="$(detect_gttsize_param)"
rec_gttsize="$(recommended_gttsize_mib)"
gtt_ok=0
if [[ "$gttsize" =~ ^[0-9]+$ ]] && (( gttsize >= rec_gttsize )); then
    gtt_ok=1
fi
check "$gtt_ok" "amdgpu.gttsize" "${gttsize:-not set} (recommended: $rec_gttsize)"

pages="$(detect_pages_limit_param)"
rec_pages="$(recommended_pages_limit)"
pages_ok=0
if [[ "$pages" =~ ^[0-9]+$ ]] && (( pages >= rec_pages )); then
    pages_ok=1
fi
check "$pages_ok" "ttm.pages_limit" "${pages:-not set} (recommended: $rec_pages)"

# Tuned profile
tuned="$(detect_tuned_profile)"
tuned_ok=0
if [[ "$tuned" == "accelerator-performance" ]]; then
    tuned_ok=1
fi
check "$tuned_ok" "Tuned profile" "$tuned"

# VRAM (should be <= 1 GiB). A reading that is not a plain integer is
# reported as a failed check rather than aborting on an arithmetic error.
vram="$(detect_vram_total)"
if [[ "$vram" =~ ^[0-9]+$ ]]; then
    vram_ok=0
    if (( vram <= ONE_GIB )); then
        vram_ok=1
    fi
    check "$vram_ok" "VRAM <= 1 GiB" "$(format_gib "$vram") GiB"
else
    check 0 "VRAM <= 1 GiB" "unknown"
fi

# GTT (should be close to recommended): passes at 75% of the recommendation
gtt="$(detect_gtt_total)"
rec_gtt_bytes=$(( rec_gttsize * 1048576 ))
gtt_label="GTT >= $(human_mib "$rec_gttsize")"
if [[ "$gtt" =~ ^[0-9]+$ ]]; then
    gtt_mem_ok=0
    if (( gtt >= rec_gtt_bytes * 3 / 4 )); then
        gtt_mem_ok=1
    fi
    check "$gtt_mem_ok" "$gtt_label" "$(format_gib "$gtt") GiB"
else
    check 0 "$gtt_label" "unknown"
fi

# GPU monitor installed
if is_cmd amdgpu_top; then
    check 1 "amdgpu_top installed" "yes"
else
    check 0 "amdgpu_top installed" "no — run make monitor-install"
fi

# Summary
echo ""
print_divider
printf "\n  ${BOLD}Score: %d / %d${RESET}\n" "$score" "$total"

if (( score == total )); then
    printf "  ${GREEN}Fully optimized!${RESET} Run 'make benchmark' to measure performance.\n"
elif (( score >= total * 3 / 4 )); then
    printf "  ${YELLOW}Nearly there${RESET} — check the failed items above.\n"
elif (( score >= total / 2 )); then
    printf "  ${YELLOW}Partially optimized${RESET} — run 'make optimize' for the remaining items.\n"
else
    printf "  ${RED}Significant optimizations pending${RESET} — run 'make optimize'\n"
fi
echo ""
#!/usr/bin/env bash
# scripts/optimize/vram-gtt.sh
# BIOS VRAM guidance + GTT verification.
#
# Prints the current VRAM / GTT / system RAM sizes, shows BIOS steps when the
# dedicated VRAM carve-out is larger than 1 GiB, checks GTT against the
# recommended size, and optionally installs amd-debug-tools via pipx.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"

readonly ONE_GIB=1073741824   # bytes per GiB
readonly KIB_PER_GIB=1048576  # KiB per GiB

# Print a count of tenths as "N.D". Negative input is clamped to 0.0.
# Arguments: $1 - integer number of tenths
format_tenths() {
    local tenths="$1"
    if (( tenths < 0 )); then
        tenths=0
    fi
    printf '%d.%d' "$(( tenths / 10 ))" "$(( tenths % 10 ))"
}

# Return the argument if it is a non-negative integer, otherwise 0, so the
# arithmetic below cannot fail on an empty or malformed reading.
# Arguments: $1 - value to validate
int_or_zero() {
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        printf '%s' "$1"
    else
        printf '0'
    fi
}

log_header "VRAM / GTT Memory Optimization"

vram_total="$(int_or_zero "$(detect_vram_total)")"
gtt_total="$(int_or_zero "$(detect_gtt_total)")"
system_ram_kb="$(int_or_zero "$(detect_system_ram_kb)")"

# One-decimal GiB values, truncated. Integer arithmetic keeps the leading
# zero for sizes below 1 GiB (0.5 rather than .5) and needs no bc.
vram_gib="$(format_tenths "$(( vram_total * 10 / ONE_GIB ))")"
gtt_gib="$(format_tenths "$(( gtt_total * 10 / ONE_GIB ))")"
system_ram_gib="$(format_tenths "$(( system_ram_kb * 10 / KIB_PER_GIB ))")"

log_info "Current memory allocation:"
print_kv "VRAM (dedicated)" "${vram_gib} GiB"
print_kv "GTT (dynamic)" "${gtt_gib} GiB"
print_kv "System RAM (visible)" "${system_ram_gib} GiB"
echo ""

rec_gttsize="$(recommended_gttsize_mib)"
rec_gtt_human="$(human_mib "$rec_gttsize")"

# ── Check if BIOS VRAM change is needed ──────────────────
# Optimal: VRAM <= 1 GiB (0.5 GiB ideal), rest dynamically via GTT
if (( vram_total > ONE_GIB )); then
    log_warn "VRAM is ${vram_gib} GiB — this permanently locks memory away from the OS."
    log_info "AMD recommends 512 MB dedicated VRAM for Strix Halo."
    log_info "The GPU accesses additional memory dynamically via GTT (kernel params)."
    echo ""

    printf "${BOLD}BIOS Configuration Steps (HP ZBook Ultra G1a):${RESET}\n"
    echo ""
    echo "  1. Reboot the laptop"
    echo "  2. Press F10 repeatedly during boot to enter BIOS Setup"
    echo "  3. Navigate to: Advanced > Built-in Device Options"
    echo "     (or Advanced > Display > UMA Frame Buffer Size)"
    echo "  4. Set UMA Frame Buffer Size to: 512 MB (or smallest available)"
    echo "  5. Save and Exit (F10)"
    echo ""
    echo "  NOTE: The exact menu path may vary by BIOS version."
    echo "  Look for 'UMA', 'Frame Buffer', 'VRAM', or 'iGPU Memory'."
    echo ""

    # Expected visible RAM = total physical RAM minus the 0.5 GiB carve-out
    physical_ram_kb="$(int_or_zero "$(detect_total_physical_ram_kb)")"
    expected_ram_gib="$(format_tenths "$(( physical_ram_kb * 10 / KIB_PER_GIB - 5 ))")"

    log_info "After BIOS change + reboot with kernel params, expected state:"
    echo "  VRAM:       ~512 MiB"
    echo "  GTT:        ~${rec_gtt_human} (with kernel params)"
    echo "  System RAM: ~${expected_ram_gib} GiB visible"
    echo ""
else
    log_success "VRAM is ${vram_gib} GiB — already optimal!"
fi

# ── Check GTT ────────────────────────────────────────────
# GTT counts as good once it reaches 75% of the recommended size.
rec_gtt_bytes=$(( rec_gttsize * 1048576 ))

if (( gtt_total >= rec_gtt_bytes * 3 / 4 )); then
    log_success "GTT is ${gtt_gib} GiB — good (recommended: ~${rec_gtt_human})"
else
    log_warn "GTT is ${gtt_gib} GiB — low (recommended: ~${rec_gtt_human})"
    log_info "This requires kernel boot parameters. Run: make optimize-kernel"
fi

# ── Optional: amd-debug-tools ────────────────────────────
echo ""
log_header "Optional: amd-debug-tools (amd-ttm)"
log_info "AMD provides 'amd-debug-tools' for runtime GTT/TTM inspection."

if is_cmd amd-ttm; then
    log_success "amd-ttm is installed"
    log_info "Current GTT settings:"
    # Informational only; a failing amd-ttm must not fail the script.
    amd-ttm 2>/dev/null || true
elif is_cmd pipx; then
    log_info "Install with: pipx install amd-debug-tools"
    if confirm "Install amd-debug-tools via pipx?"; then
        # The tool is optional, so a failed install is reported, not fatal.
        if pipx install amd-debug-tools; then
            log_success "Installed. Run 'amd-ttm' to inspect GTT allocation."
        else
            log_warn "pipx could not install amd-debug-tools"
        fi
    fi
else
    log_info "Install pipx first: sudo dnf install pipx"
    log_info "Then: pipx install amd-debug-tools"
fi