Initial commit

This commit is contained in:
Felipe Cardoso
2026-03-25 20:13:15 +01:00
commit c596e38e9e
26 changed files with 2345 additions and 0 deletions

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env bash
# Quick-glance system audit — single screen status overview
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
# ── Gather data ──────────────────────────────────────────
# Take one snapshot of every probe up front so the output section below only
# has to format values. The detect_* / recommended_* helpers are not defined in
# this script (presumably they come from the sourced lib/ files).

# CPU / GPU identity
cpu_model=$(detect_cpu_model)
cpu_threads=$(detect_cpu_cores)      # printed as the "T" figure
cpu_physical=$(detect_cpu_physical)  # printed as the "C" figure
gpu_name=$(detect_gpu_name)

# Kernel and firmware
kernel=$(detect_kernel_version)
firmware=$(detect_firmware_version)

# Memory: VRAM/GTT totals are compared against byte thresholds further down
vram_total=$(detect_vram_total)
vram_used=$(detect_vram_used)
gtt_total=$(detect_gtt_total)
gtt_used=$(detect_gtt_used)
ram_kb=$(detect_system_ram_kb)
ram_bytes=$(( 1024 * ram_kb ))

# Kernel command-line parameters (empty string means "not set")
param_iommu=$(detect_kernel_param iommu)
param_gttsize=$(detect_gttsize_param)
param_pages=$(detect_pages_limit_param)

# Performance profile and software stack
tuned=$(detect_tuned_profile)
rocm_ver=$(detect_rocm_version)
vulkan_drv=$(detect_vulkan_driver)
vulkan_ver=$(detect_vulkan_version)

# Recommended values, used in the hints printed next to failed checks
rec_gttsize=$(recommended_gttsize_mib)
rec_pages=$(recommended_pages_limit)

# ── Score tracking ───────────────────────────────────────
# Both counters are updated by check(): total counts scored checks,
# score counts the ones that passed.
score=0
total=0
# check PASS LABEL DETAIL
# Record one scored check and print its status line.
#   PASS   - the literal string "1" counts as a pass; anything else is a fail
#   LABEL  - short name of the check
#   DETAIL - free-form text shown next to the label
# Globals: total is always incremented; score is incremented on a pass.
check() {
  local verdict="$1" title="$2" info="$3"
  total=$(( total + 1 ))
  case "$verdict" in
    1)
      score=$(( score + 1 ))
      print_status pass "$title" "$info"
      ;;
    *)
      print_status fail "$title" "$info"
      ;;
  esac
}
# check_warn LABEL DETAIL
# Print a warning-level status line. Unlike check(), this does not touch the
# score/total counters.
check_warn() {
  print_status warn "$1" "$2"
}
# check_info LABEL DETAIL
# Print an informational status line. Unlike check(), this does not touch the
# score/total counters.
check_info() {
  print_status info "$1" "$2"
}
# ── Output ───────────────────────────────────────────────
# Banner. BOLD, CYAN and RESET are not defined in this script (presumably
# terminal colour codes from the sourced lib/ files). The first printf emits a
# blank line plus BOLD and CYAN, the quoted heredoc delimiter prints the box
# verbatim with no expansion, and the last printf emits RESET.
printf "\n${BOLD}${CYAN}"
cat << 'BANNER'
╔═══════════════════════════════════════════╗
║ AMD Strix Halo — System Status ║
╚═══════════════════════════════════════════╝
BANNER
printf "${RESET}"
# Hardware — informational rows only; none of them go through check(), so
# they do not affect the optimization score.
log_header "Hardware"
print_kv "CPU" "$cpu_model (${cpu_physical}C/${cpu_threads}T)"
print_kv "GPU" "$gpu_name"
print_kv "System RAM (visible)" "$(human_bytes "$ram_bytes")"
# Kernel & Firmware
log_header "Kernel & Firmware"

# kernel_at_least VERSION MAJOR MINOR PATCH
# Succeed (return 0) when VERSION is at least MAJOR.MINOR.PATCH.
#   VERSION - a kernel release string such as "6.18.4-200.fc43.x86_64"; only
#             the leading numeric "X.Y[.Z]" part is compared, and a missing
#             patch level counts as 0.
# Returns 1 when VERSION is older or does not start with "X.Y".
kernel_at_least() {
  local version="$1" want_major="$2" want_minor="$3" want_patch="$4"
  local version_re='^([0-9]+)\.([0-9]+)(\.([0-9]+))?'
  local major minor patch

  [[ "$version" =~ $version_re ]] || return 1
  # Force base 10 so a component such as "08" is not parsed as invalid octal.
  major=$(( 10#${BASH_REMATCH[1]} ))
  minor=$(( 10#${BASH_REMATCH[2]} ))
  patch=$(( 10#${BASH_REMATCH[4]:-0} ))

  if (( major != want_major )); then
    if (( major > want_major )); then return 0; else return 1; fi
  fi
  if (( minor != want_minor )); then
    if (( minor > want_minor )); then return 0; else return 1; fi
  fi
  if (( patch >= want_patch )); then return 0; else return 1; fi
}

# The requirement shown to the user is 6.18.4, so the patch level has to be
# part of the comparison (6.18.0 through 6.18.3 must fail).
kernel_ok=0
if kernel_at_least "$kernel" 6 18 4; then
  kernel_ok=1
fi
check "$kernel_ok" "Kernel version" "$kernel (need >= 6.18.4)"

# Firmware passes unless detect_firmware_bad reports a match.
firmware_ok=1
firmware_note="$firmware"
if detect_firmware_bad; then
  firmware_ok=0
  firmware_note="$firmware (KNOWN BAD — causes ROCm crashes!)"
fi
check "$firmware_ok" "Firmware" "$firmware_note"
# Memory allocation
log_header "Memory Allocation"

# Totals converted from bytes to GiB with one decimal, for display only.
vram_gib=$(bc <<< "scale=1; $vram_total / 1073741824")
gtt_gib=$(bc <<< "scale=1; $gtt_total / 1073741824")

# VRAM: passes at 1 GiB or less (0.5 GiB is the suggested BIOS setting)
if (( vram_total <= 1073741824 )); then
  vram_ok=1
  vram_detail="${vram_gib} GiB"
else
  vram_ok=0
  vram_detail="${vram_gib} GiB — should be 0.5 GiB in BIOS"
fi
check "$vram_ok" "VRAM (dedicated)" "$vram_detail"

# GTT: passes when it reaches at least 75% of the recommended size
gtt_rec_bytes=$(( rec_gttsize * 1048576 ))
gtt_min_bytes=$(( gtt_rec_bytes * 3 / 4 ))
if (( gtt_total >= gtt_min_bytes )); then
  gtt_ok=1
  gtt_detail="${gtt_gib} GiB"
else
  gtt_ok=0
  gtt_detail="${gtt_gib} GiB — should be ~$(human_mib "$rec_gttsize") with kernel params"
fi
check "$gtt_ok" "GTT (dynamic)" "$gtt_detail"

# Current usage is informational and not scored.
print_kv "VRAM in use" "$(human_bytes "$vram_used")"
print_kv "GTT in use" "$(human_bytes "$gtt_used")"
# Kernel boot parameters
log_header "Kernel Boot Parameters"

# iommu: only the value "pt" passes; any other value is shown, an empty one
# is reported as MISSING.
if [[ "$param_iommu" == "pt" ]]; then
  iommu_ok=1
else
  iommu_ok=0
fi
if [[ -n "$param_iommu" ]]; then
  iommu_detail="current: $param_iommu"
else
  iommu_detail="MISSING"
fi
check "$iommu_ok" "iommu=pt" "$iommu_detail"

# amdgpu.gttsize: passes whenever the parameter is present, whatever its value
if [[ -n "$param_gttsize" ]]; then
  gtt_param_ok=1
  gtt_param_detail="current: ${param_gttsize} MiB"
else
  gtt_param_ok=0
  gtt_param_detail="MISSING — recommended: ${rec_gttsize}"
fi
check "$gtt_param_ok" "amdgpu.gttsize" "$gtt_param_detail"

# ttm.pages_limit: passes whenever the parameter is present, whatever its value
if [[ -n "$param_pages" ]]; then
  pages_ok=1
  pages_detail="current: $param_pages"
else
  pages_ok=0
  pages_detail="MISSING — recommended: ${rec_pages}"
fi
check "$pages_ok" "ttm.pages_limit" "$pages_detail"
# Tuned profile
log_header "Performance Profile"
# Only the exact profile name "accelerator-performance" passes; anything else
# is shown together with the recommendation.
case "$tuned" in
  accelerator-performance)
    tuned_ok=1
    tuned_detail="$tuned"
    ;;
  *)
    tuned_ok=0
    tuned_detail="$tuned — recommended: accelerator-performance"
    ;;
esac
check "$tuned_ok" "Tuned profile" "$tuned_detail"
# Software stack — informational rows, not scored
log_header "Software Stack"
check_info "ROCm" "$rocm_ver"
check_info "Vulkan" "$vulkan_drv $vulkan_ver"

# Toolboxes
# Capture the names once so the count and the listing always agree.
# Reading through process substitution also means a non-zero exit from
# detect_toolbox_names cannot abort this overview under `set -o pipefail`;
# the result is then simply an empty (or partial) list.
toolbox_names=()
if is_cmd toolbox; then
  mapfile -t toolbox_names < <(detect_toolbox_names)
fi
toolbox_count=${#toolbox_names[@]}

if (( toolbox_count > 0 )); then
  check_info "Toolbox containers" "$toolbox_count available"
  for name in "${toolbox_names[@]}"; do
    printf " ${DIM}%s${RESET}\n" "$name"
  done
else
  check_warn "Toolbox containers" "none — run 'make benchmark-setup'"
fi
# LLM stacks
log_header "LLM Stacks"
# "label|probe" pairs, reported in this fixed order. Each probe is run right
# before its row is printed and its output becomes the row's detail text.
for stack_entry in \
  "LM Studio|detect_stack_lmstudio" \
  "opencode|detect_stack_opencode" \
  "ollama|detect_stack_ollama" \
  "llama.cpp (native)|detect_stack_llamacpp"; do
  stack_label=${stack_entry%%|*}
  stack_probe=${stack_entry##*|}
  check_info "$stack_label" "$("$stack_probe")"
done
# Sensors
log_header "Current Sensors"
gpu_temp=$(detect_gpu_temp)
gpu_power=$(detect_gpu_power)
gpu_busy=$(detect_gpu_busy)

# Raw readings are scaled for display: temperature by 1/1000 and power by
# 1/1000000 (presumably millidegrees C and microwatts — confirm against the
# detect helpers). Utilization is printed as reported.
print_kv "GPU Temperature" "$(bc <<< "scale=1; $gpu_temp / 1000") C"
print_kv "GPU Power" "$(bc <<< "scale=1; $gpu_power / 1000000") W"
print_kv "GPU Utilization" "${gpu_busy}%"
# Overall score
log_header "Optimization Score"
printf "\n ${BOLD}%d / %d${RESET} checks passing\n" "$score" "$total"

# Pick the verdict: everything passing, at least half passing (integer
# division), or fewer than half passing.
if (( score != total )); then
  if (( score < total / 2 )); then
    verdict_color="$RED"
    verdict_text="Significant optimizations available — run 'make optimize'"
  else
    verdict_color="$YELLOW"
    verdict_text="Partially optimized — run 'make optimize' for improvements"
  fi
else
  verdict_color="$GREEN"
  verdict_text="System is fully optimized!"
fi
printf " ${verdict_color}${verdict_text}${RESET}\n"
printf '\n'

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# Full system report — detailed audit with JSON + text output
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../../lib/common.sh"
source "$SCRIPT_DIR/../../lib/detect.sh"
source "$SCRIPT_DIR/../../lib/format.sh"
# "--json" as the first argument selects JSON-only mode: the report is printed
# to stdout and nothing is saved (see the JSON_ONLY branch further down).
case "${1:-}" in
  --json) JSON_ONLY=true ;;
  *) JSON_ONLY=false ;;
esac

# ── Gather all data ──────────────────────────────────────
# One snapshot of every probe. The helpers called here are not defined in this
# script (presumably they come from the sourced lib/ files).
ts=$(timestamp)   # also used in the report file names

# CPU / GPU identity
cpu_model=$(detect_cpu_model)
cpu_threads=$(detect_cpu_cores)
cpu_physical=$(detect_cpu_physical)
gpu_name=$(detect_gpu_name)
gpu_device_id=$(detect_gpu_device_id)

# Kernel and firmware
kernel=$(detect_kernel_version)
firmware=$(detect_firmware_version)

# Memory
vram_total=$(detect_vram_total)
vram_used=$(detect_vram_used)
gtt_total=$(detect_gtt_total)
gtt_used=$(detect_gtt_used)
ram_kb=$(detect_system_ram_kb)

# Kernel command line: selected parameters plus the full line as booted
param_iommu=$(detect_kernel_param iommu)
param_gttsize=$(detect_gttsize_param)
param_pages=$(detect_pages_limit_param)
cmdline=$(< /proc/cmdline)

# Performance profile and software stack
tuned=$(detect_tuned_profile)
rocm_ver=$(detect_rocm_version)
vulkan_drv=$(detect_vulkan_driver)
vulkan_ver=$(detect_vulkan_version)

# Sensors (raw values; scaled only when printed in the text report)
gpu_temp=$(detect_gpu_temp)
gpu_power=$(detect_gpu_power)
gpu_busy=$(detect_gpu_busy)

# Recommended settings
rec_gttsize=$(recommended_gttsize_mib)
rec_pages=$(recommended_pages_limit)
# Toolbox list
# toolbox_names_json: print the toolbox names as exactly one JSON array.
# Prints "[]" when the toolbox command is unavailable, when no names are
# reported, or when the jq conversion fails.
#
# The names are captured before conversion and the fallback replaces the
# result instead of being appended to it. With `set -o pipefail`, a non-zero
# exit from detect_toolbox_names would otherwise fail the pipeline after jq
# had already printed an array, leaving two JSON documents in the variable.
toolbox_names_json() {
  local names json

  if ! is_cmd toolbox; then
    printf '[]\n'
    return 0
  fi

  # Keep whatever names were printed even if the probe exits non-zero.
  names="$(detect_toolbox_names)" || true
  if [[ -z "$names" ]]; then
    printf '[]\n'
    return 0
  fi

  if json="$(printf '%s\n' "$names" | jq -R . | jq -s . 2>/dev/null)" \
    && [[ -n "$json" ]]; then
    printf '%s\n' "$json"
  else
    printf '[]\n'
  fi
}
toolboxes_json="$(toolbox_names_json)"
# LLM stacks
stack_ollama="$(detect_stack_ollama)"
stack_lmstudio="$(detect_stack_lmstudio)"
stack_llamacpp="$(detect_stack_llamacpp)"
stack_opencode="$(detect_stack_opencode)"

# ROCm packages
# rocm_package_summary: print at most the first 30 lines that
# detect_rocm_packages reports.
# `head` stops reading after 30 lines, so a longer listing can kill the
# producer with SIGPIPE; with `set -euo pipefail` that (or any non-zero exit
# from the probe) would abort the whole report. The package list is
# best-effort, so the pipeline status is deliberately ignored.
rocm_package_summary() {
  detect_rocm_packages | head -n 30 || true
}
rocm_pkgs="$(rocm_package_summary)"
# ── Build JSON (all data via env vars — no shell interpolation into Python) ──
# build_json_report: print the report as a JSON document on stdout.
# Reads the gathered shell variables and hands each one to Python through an
# SR_* environment variable, so no value is ever pasted into the Python source.
# Numeric fields become JSON null when a probe returned nothing or something
# that is not a whole number, and a malformed toolbox list becomes [], so one
# bad reading cannot take down the whole report.
# NOTE: the Python source sits inside a single-quoted shell string, so it must
# start at column 0 and must not contain single quotes.
build_json_report() {
  SR_TS="$ts" \
  SR_CPU_MODEL="$cpu_model" SR_CPU_CORES="$cpu_physical" SR_CPU_THREADS="$cpu_threads" \
  SR_GPU_NAME="$gpu_name" SR_GPU_DEVICE_ID="$gpu_device_id" SR_RAM_KB="$ram_kb" \
  SR_VRAM_TOTAL="$vram_total" SR_VRAM_USED="$vram_used" \
  SR_GTT_TOTAL="$gtt_total" SR_GTT_USED="$gtt_used" \
  SR_REC_GTTSIZE="$rec_gttsize" SR_REC_PAGES="$rec_pages" \
  SR_KERNEL="$kernel" SR_CMDLINE="$cmdline" \
  SR_PARAM_IOMMU="$param_iommu" SR_PARAM_GTTSIZE="$param_gttsize" SR_PARAM_PAGES="$param_pages" \
  SR_FIRMWARE="$firmware" SR_TUNED="$tuned" SR_ROCM="$rocm_ver" \
  SR_VULKAN_DRV="$vulkan_drv" SR_VULKAN_VER="${vulkan_ver:-}" \
  SR_GPU_TEMP="$gpu_temp" SR_GPU_POWER="$gpu_power" SR_GPU_BUSY="$gpu_busy" \
  SR_TOOLBOXES="$toolboxes_json" \
  SR_STACK_OLLAMA="$stack_ollama" SR_STACK_LMSTUDIO="$stack_lmstudio" \
  SR_STACK_LLAMACPP="$stack_llamacpp" SR_STACK_OPENCODE="$stack_opencode" \
  python3 -c '
import json, os

e = os.environ


def num(key):
    # Integer value of an environment variable, or None (JSON null) when it
    # is empty or not a whole number.
    try:
        return int(e.get(key, "").strip())
    except ValueError:
        return None


def toolboxes():
    # SR_TOOLBOXES is expected to hold a JSON array; anything else yields [].
    try:
        parsed = json.loads(e.get("SR_TOOLBOXES") or "[]")
    except ValueError:
        return []
    return parsed if isinstance(parsed, list) else []


data = {
    "timestamp": e["SR_TS"],
    "hardware": {
        "cpu_model": e["SR_CPU_MODEL"],
        "cpu_cores": num("SR_CPU_CORES"),
        "cpu_threads": num("SR_CPU_THREADS"),
        "gpu_name": e["SR_GPU_NAME"],
        "gpu_device_id": e["SR_GPU_DEVICE_ID"],
        "system_ram_kb": num("SR_RAM_KB"),
    },
    "memory": {
        "vram_total_bytes": num("SR_VRAM_TOTAL"),
        "vram_used_bytes": num("SR_VRAM_USED"),
        "gtt_total_bytes": num("SR_GTT_TOTAL"),
        "gtt_used_bytes": num("SR_GTT_USED"),
        "recommended_gttsize_mib": num("SR_REC_GTTSIZE"),
        "recommended_pages_limit": num("SR_REC_PAGES"),
    },
    "kernel": {
        "version": e["SR_KERNEL"],
        "cmdline": e["SR_CMDLINE"],
        "param_iommu": e["SR_PARAM_IOMMU"],
        "param_gttsize": e["SR_PARAM_GTTSIZE"],
        "param_pages_limit": e["SR_PARAM_PAGES"],
    },
    "firmware": e["SR_FIRMWARE"],
    "tuned_profile": e["SR_TUNED"],
    "rocm_version": e["SR_ROCM"],
    "vulkan": {
        "driver": e["SR_VULKAN_DRV"],
        "version": e["SR_VULKAN_VER"],
    },
    "sensors": {
        "gpu_temp_mc": num("SR_GPU_TEMP"),
        "gpu_power_uw": num("SR_GPU_POWER"),
        "gpu_busy_pct": num("SR_GPU_BUSY"),
    },
    "toolboxes": toolboxes(),
    "stacks": {
        "ollama": e["SR_STACK_OLLAMA"],
        "lmstudio": e["SR_STACK_LMSTUDIO"],
        "llamacpp": e["SR_STACK_LLAMACPP"],
        "opencode": e["SR_STACK_OPENCODE"],
    },
}
print(json.dumps(data, indent=2))
'
}
json_report="$(build_json_report)"
# JSON-only mode: print the report and stop before anything is written to disk.
# The report goes through `python3 -m json.tool`; if that fails, the text
# produced above is printed unchanged.
if [[ "$JSON_ONLY" == true ]]; then
  if ! python3 -m json.tool <<< "$json_report" 2>/dev/null; then
    printf '%s\n' "$json_report"
  fi
  exit 0
fi

# ── Save report ──────────────────────────────────────────
# Both files share the same timestamped name and differ only in extension.
audit_dir=$(data_dir audits)
report_stem="$audit_dir/report-${ts}"
json_file="${report_stem}.json"
text_file="${report_stem}.txt"
# Same json.tool pass as above, with the same raw-text fallback.
if ! python3 -m json.tool <<< "$json_report" > "$json_file" 2>/dev/null; then
  printf '%s\n' "$json_report" > "$json_file"
fi
# ── Text output (also saved) ────────────────────────────
# The whole group is piped through tee, so the report is shown on stdout and
# written to $text_file in one pass.
{
  printf "Strix Halo Full System Report — %s\n" "$ts"
  # "%.0s" prints nothing per argument, so this emits one "=" for each of the
  # 60 brace-expanded arguments: a 60-character rule.
  printf "=%.0s" {1..60}; echo
  printf "\nHardware:\n"
  printf " CPU: %s (%sC/%sT)\n" "$cpu_model" "$cpu_physical" "$cpu_threads"
  printf " GPU: %s (device: 0x%s)\n" "$gpu_name" "$gpu_device_id"
  printf " RAM: %s KB\n" "$ram_kb"
  printf "\nMemory Allocation:\n"
  printf " VRAM total: %s (used: %s)\n" "$(human_bytes "$vram_total")" "$(human_bytes "$vram_used")"
  printf " GTT total: %s (used: %s)\n" "$(human_bytes "$gtt_total")" "$(human_bytes "$gtt_used")"
  printf " Recommended: gttsize=%s MiB, pages_limit=%s\n" "$rec_gttsize" "$rec_pages"
  printf "\nKernel:\n"
  printf " Version: %s\n" "$kernel"
  printf " Firmware: %s\n" "$firmware"
  printf " Cmdline: %s\n" "$cmdline"
  # Unset parameters are reported as "not set".
  printf " iommu: %s\n" "${param_iommu:-not set}"
  printf " gttsize: %s\n" "${param_gttsize:-not set}"
  # A space after the colon keeps the value from being glued to the label.
  printf " pages_limit: %s\n" "${param_pages:-not set}"
  printf "\nPerformance:\n"
  printf " Tuned: %s\n" "$tuned"
  # Sensor readings are scaled for display (temp / 1000, power / 1000000).
  printf " GPU temp: %s C\n" "$(echo "scale=1; $gpu_temp / 1000" | bc)"
  printf " GPU power: %s W\n" "$(echo "scale=1; $gpu_power / 1000000" | bc)"
  printf " GPU busy: %s%%\n" "$gpu_busy"
  printf "\nSoftware:\n"
  printf " ROCm: %s\n" "$rocm_ver"
  printf " Vulkan: %s %s\n" "$vulkan_drv" "$vulkan_ver"
  printf "\nROCm Packages:\n"
  # Prefix every package line with a space to nest it under the heading.
  echo "$rocm_pkgs" | sed 's/^/ /'
  printf "\nToolboxes:\n"
  if [[ "$toolboxes_json" == "[]" ]]; then
    printf " none\n"
  else
    # Decode the JSON array back into one name per line.
    echo "$toolboxes_json" | python3 -c "import sys,json; [print(f' {x}') for x in json.load(sys.stdin)]" 2>/dev/null || printf " (parse error)\n"
  fi
  printf "\nLLM Stacks:\n"
  printf " ollama: %s\n" "$stack_ollama"
  printf " LM Studio: %s\n" "$stack_lmstudio"
  printf " llama.cpp: %s\n" "$stack_llamacpp"
  printf " opencode: %s\n" "$stack_opencode"
} | tee "$text_file"
echo ""
log_success "Report saved to:"
log_info " JSON: $json_file"
log_info " Text: $text_file"