Initial commit
This commit is contained in:
90
scripts/monitor/dashboard.sh
Normal file
90
scripts/monitor/dashboard.sh
Normal file
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tmux-based monitoring dashboard
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||
|
||||
SESSION="strix-monitor"
SIMPLE=false
WITH_LOG=false

# Parse command-line flags into the SIMPLE / WITH_LOG globals.
#   --simple, -s        sets SIMPLE=true
#   --with-logging, -l  sets WITH_LOG=true
# Any other argument is reported through log_warn and skipped (the same
# wording log-metrics.sh uses) instead of being dropped silently.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --simple | -s) SIMPLE=true ;;
      --with-logging | -l) WITH_LOG=true ;;
      *) log_warn "Unknown argument: $1" ;;
    esac
    shift
  done
}

parse_args "$@"
|
||||
|
||||
# Simple mode: replace this process with a single GPU monitor (no tmux).
# amdgpu_top is tried first, nvtop second; with neither available the script
# exits 1. SIMPLE is compared as a string rather than executed as a command.
if [[ "$SIMPLE" == true ]]; then
  if is_cmd amdgpu_top; then
    exec amdgpu_top
  fi
  if is_cmd nvtop; then
    log_warn "amdgpu_top not found, falling back to nvtop"
    exec nvtop
  fi
  log_error "No GPU monitor installed. Run: make monitor-install"
  exit 1
fi
|
||||
|
||||
# Full dashboard requires tmux: bail out with an install hint when is_cmd
# does not find it.
if ! is_cmd tmux; then
  log_error "tmux is required for dashboard mode. Run: make monitor-install"
  exit 1
fi
|
||||
|
||||
# Pick GPU monitor: amdgpu_top first, nvtop second. When neither is available
# stop here with the same message simple mode prints, rather than handing tmux
# a pane command that cannot start.
if is_cmd amdgpu_top; then
  GPU_MON="amdgpu_top"
elif is_cmd nvtop; then
  GPU_MON="nvtop"
else
  log_error "No GPU monitor installed. Run: make monitor-install"
  exit 1
fi

# Pick system monitor: btop, then htop, then plain top as the last resort.
if is_cmd btop; then
  SYS_MON="btop"
elif is_cmd htop; then
  SYS_MON="htop"
else
  SYS_MON="top"
fi
|
||||
|
||||
# Kill existing session if running (a missing session is not an error).
tmux kill-session -t "$SESSION" 2>/dev/null || true

# Start background logging if requested.
# LOG_CMD is the command string later given to tmux for the log pane;
# LOG_PID is the logger's PID, or empty when no logger was started.
LOG_CMD="echo Metric logging not active. Use --with-logging to enable.; read -r"
LOG_PID=""
if [[ "$WITH_LOG" == true ]]; then
  LOG_FILE="$(data_dir logs)/metrics-$(timestamp).csv"
  bash "$SCRIPT_DIR/log-metrics.sh" --output "$LOG_FILE" &
  LOG_PID=$!
  # LOG_CMD is re-parsed by a shell, so escape the path with %q instead of
  # wrapping it in double quotes (which breaks on '"' or '$' in the path).
  # tail -F retries by name, so the pane survives starting before the logger
  # has created the file.
  printf -v LOG_CMD 'tail -F %q' "$LOG_FILE"
  log_info "Metric logger started (PID: $LOG_PID) → $LOG_FILE"
fi
|
||||
|
||||
# Cleanup logger on exit: stop the background metric logger, if one was
# started, and reap it.
# Globals: LOG_PID (read) - PID of the logger, empty when none is running
cleanup() {
  [[ -n "${LOG_PID:-}" ]] || return 0
  # Both steps are best-effort: the logger may already be gone.
  kill "$LOG_PID" 2>/dev/null || true
  wait "$LOG_PID" 2>/dev/null || true
}
trap cleanup EXIT
|
||||
|
||||
# Create tmux layout
# +---------------------+---------------------+
# |     GPU monitor     |   System monitor    |
# |                     |                     |
# +---------------------+---------------------+
# |         Metrics log tail / status         |
# +-------------------------------------------+
#
# -P -F '#{pane_id}' makes new-session print the id of the pane it creates, so
# later commands can address the GPU pane without assuming that base-index and
# pane-base-index are 0.
gpu_pane="$(tmux new-session -d -P -F '#{pane_id}' -s "$SESSION" \
  -x "$(tput cols 2>/dev/null || echo 120)" \
  -y "$(tput lines 2>/dev/null || echo 40)" \
  "$GPU_MON")"
tmux split-window -t "$gpu_pane" -h "$SYS_MON"
# -f splits the whole window instead of the active pane, so the log pane spans
# the full width as drawn above (20% of the window height).
tmux split-window -t "$gpu_pane" -v -f -p 20 "$LOG_CMD"
tmux select-pane -t "$gpu_pane"

log_info "Dashboard started. Attach with: tmux attach -t $SESSION"
log_info "Detach with Ctrl+B then D. Kill with: tmux kill-session -t $SESSION"
tmux attach -t "$SESSION"
|
||||
97
scripts/monitor/install-tools.sh
Normal file
97
scripts/monitor/install-tools.sh
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env bash
|
||||
# Install monitoring tools for Strix Halo
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||
|
||||
log_header "Monitoring Tools Installer"

# ── amdgpu_top (most important) ─────────────────────────
# Install methods are tried in order until one succeeds: the pre-built RPM
# from GitHub releases, the dnf repos, then `cargo install`. When all of them
# fail, manual instructions are printed and the script carries on.
if is_cmd amdgpu_top; then
  log_success "amdgpu_top already installed: $(amdgpu_top --version 2>&1 | head -1)"
else
  log_info "Installing amdgpu_top (best AMD GPU monitor)..."
  installed=false
  AMDGPU_TOP_VERSION="0.11.2"
  RPM_URL="https://github.com/Umio-Yasuno/amdgpu_top/releases/download/v${AMDGPU_TOP_VERSION}/amdgpu_top-${AMDGPU_TOP_VERSION}-1.x86_64.rpm"

  # Method 1: Install RPM from GitHub releases (fastest, works on Fedora)
  log_info "Downloading pre-built RPM from GitHub releases..."
  # Download into a private mktemp directory: a fixed file name in /tmp could
  # be pre-created or swapped by another local user before `sudo dnf` reads it.
  if rpm_dir="$(mktemp -d)"; then
    RPM_FILE="$rpm_dir/amdgpu_top-${AMDGPU_TOP_VERSION}.rpm"
    if curl -fsSL -o "$RPM_FILE" "$RPM_URL" 2>/dev/null; then
      if sudo dnf install -y "$RPM_FILE" 2>&1; then
        installed=true
        log_success "amdgpu_top installed from RPM"
      else
        log_warn "RPM install failed"
      fi
    else
      log_warn "RPM download failed"
    fi
    # Remove the download whether or not the install succeeded.
    rm -rf -- "${rpm_dir:?}"
  else
    log_warn "Could not create a temporary directory for the RPM download"
  fi

  # Method 2: Try dnf repos
  if ! $installed; then
    log_info "Trying dnf repos..."
    if sudo dnf install -y amdgpu_top 2>/dev/null; then
      installed=true
      log_success "amdgpu_top installed via dnf"
    fi
  fi

  # Method 3: cargo (if available)
  if ! $installed && is_cmd cargo; then
    log_info "Building from source via cargo..."
    if cargo install amdgpu_top 2>&1; then
      installed=true
      log_success "amdgpu_top installed via cargo"
    else
      log_warn "cargo install failed"
    fi
  fi

  if ! $installed; then
    log_warn "Could not install amdgpu_top automatically."
    log_info "Manual options:"
    log_info "  1. Download RPM: curl -LO $RPM_URL && sudo dnf install ./amdgpu_top-*.rpm"
    log_info "  2. Download AppImage: https://github.com/Umio-Yasuno/amdgpu_top/releases/latest"
  fi
fi
|
||||
|
||||
# Make sure a tool is installed, installing it with dnf when is_cmd does not
# find it. A failed install is logged as a warning and is not fatal.
# Arguments: $1 - command name, also used as the dnf package name
install_dnf_tool() {
  local tool=$1
  if is_cmd "$tool"; then
    log_success "$tool already installed"
    return 0
  fi
  log_info "Installing $tool..."
  if sudo dnf install -y "$tool" 2>&1; then
    log_success "$tool installed"
  else
    log_warn "Could not install $tool via dnf"
  fi
}

# ── btop ─────────────────────────────────────────────────
install_dnf_tool btop

# ── tmux (needed for dashboard) ──────────────────────────
install_dnf_tool tmux
|
||||
|
||||
# ── Verify existing tools ───────────────────────────────
# Print one status line per known monitoring tool, according to is_cmd.
log_header "Monitoring Tools Status"
for tool in amdgpu_top nvtop btop amd-smi rocm-smi tmux; do
  if is_cmd "$tool"; then
    log_success "$tool"
  else
    log_warn "$tool — not installed"
  fi
done
|
||||
127
scripts/monitor/log-metrics.sh
Normal file
127
scripts/monitor/log-metrics.sh
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env bash
|
||||
# Background metric collector — samples GPU and system stats to CSV
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||
source "$SCRIPT_DIR/../../lib/detect.sh"
|
||||
|
||||
INTERVAL=2
OUTPUT=""
DURATION=0 # 0 = indefinite

# Parse command-line options into the OUTPUT / INTERVAL / DURATION globals.
# A value-taking option given as the last argument is reported and exits 1,
# instead of tripping an unbound "$2" under `set -u`. Unknown arguments are
# warned about and skipped. --help prints usage and exits 0.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --output | -o | --interval | -i | --duration | -d)
        if (( $# < 2 )); then
          log_error "$1 requires a value"
          exit 1
        fi
        case "$1" in
          --output | -o) OUTPUT="$2" ;;
          --interval | -i) INTERVAL="$2" ;;
          --duration | -d) DURATION="$2" ;;
        esac
        shift 2
        ;;
      --help | -h)
        echo "Usage: log-metrics.sh [--output FILE] [--interval SECS] [--duration SECS]"
        exit 0
        ;;
      *)
        log_warn "Unknown argument: $1"
        shift
        ;;
    esac
  done
}

# Validate numeric args and normalise them to base 10.
# INTERVAL must be >= 1 (as its error message says); DURATION may be 0, which
# means "run indefinitely". The 10# prefix stops values with leading zeros
# such as "08" from being rejected later as invalid octal.
validate_args() {
  if ! [[ "$INTERVAL" =~ ^[0-9]+$ ]] || (( 10#$INTERVAL < 1 )); then
    log_error "--interval must be a positive integer"
    exit 1
  fi
  if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
    log_error "--duration must be a non-negative integer"
    exit 1
  fi
  INTERVAL=$(( 10#$INTERVAL ))
  DURATION=$(( 10#$DURATION ))
}

parse_args "$@"
validate_args
|
||||
|
||||
# Without --output, default to a timestamped CSV under whatever directory
# `data_dir logs` prints; then make sure the parent directory exists.
if [[ -z "$OUTPUT" ]]; then
  OUTPUT="$(data_dir logs)/metrics-$(timestamp).csv"
fi

# "--" keeps a path that starts with "-" from being parsed as an option.
mkdir -p -- "$(dirname -- "$OUTPUT")"
|
||||
|
||||
# Print the first argument that is a regular file; print nothing if none is.
# An unmatched glob reaches this function as its literal pattern and is
# rejected by the -f test.
first_existing_file() {
  local candidate
  for candidate in "$@"; do
    if [[ -f "$candidate" ]]; then
      printf '%s' "$candidate"
      return 0
    fi
  done
  return 0
}

# Cache sysfs paths once (avoid re-globbing every iteration).
# GPU_SYSFS is not assigned in this script — presumably lib/detect.sh sets it;
# confirm there.
SYSFS_GPU_BUSY="$GPU_SYSFS/gpu_busy_percent"
SYSFS_VRAM_USED="$GPU_SYSFS/mem_info_vram_used"
SYSFS_GTT_USED="$GPU_SYSFS/mem_info_gtt_used"
# Left empty when no matching hwmon file exists.
SYSFS_TEMP="$(first_existing_file "$GPU_SYSFS"/hwmon/hwmon*/temp1_input)"
SYSFS_POWER="$(first_existing_file "$GPU_SYSFS"/hwmon/hwmon*/power1_average)"
|
||||
|
||||
# Write CSV header (truncates any existing file at $OUTPUT).
printf '%s\n' "timestamp,gpu_busy_pct,vram_used_mib,gtt_used_mib,gpu_temp_c,gpu_power_w,cpu_pct,ram_used_mib" > "$OUTPUT"

log_info "Logging metrics every ${INTERVAL}s → $OUTPUT"
if (( DURATION > 0 )); then
  log_info "Will stop after ${DURATION}s"
fi

# Run state: start_time anchors the --duration check in the sampling loop;
# stopped makes the EXIT handler report only once.
start_time=$SECONDS
stopped=false
|
||||
|
||||
# EXIT handler: log, once, how many samples were written.
# Globals: stopped (read/written) - guards against reporting twice
#          OUTPUT  (read)         - CSV file whose data rows are counted
cleanup() {
  if [[ "${stopped:-false}" == true ]]; then
    return 0
  fi
  stopped=true

  # Samples = total lines minus the CSV header. If the file is missing or
  # empty, report 0 rather than -1.
  local lines=0
  if [[ -r "${OUTPUT:-}" ]]; then
    lines=$(( $(wc -l < "$OUTPUT") - 1 ))
    (( lines >= 0 )) || lines=0
  fi
  log_info "Metric logger stopped. $lines samples in $OUTPUT"
}
trap cleanup EXIT
|
||||
|
||||
# Read the first line of /proc/stat into CPU_TOTAL and CPU_IDLE using bash
# builtins only.
# Arguments: $1 - stat file to read (optional, defaults to /proc/stat)
# Globals:   CPU_TOTAL (written) - user+nice+system+idle+iowait+irq+softirq+steal
#            CPU_IDLE  (written) - the idle field
# Returns:   non-zero when nothing could be read from the file
read_cpu_stat() {
  local stat_file=${1:-/proc/stat}
  local label="" user nice system idle iowait irq softirq steal rest
  # "cpu user nice system idle iowait irq softirq steal ..."
  # Reading straight into named fields avoids the unquoted `set -- $line`
  # word-split. A final line without a trailing newline still counts as read.
  read -r label user nice system idle iowait irq softirq steal rest \
    < "$stat_file" || [[ -n "$label" ]] || return 1
  # Missing trailing fields count as 0.
  CPU_TOTAL=$(( ${user:-0} + ${nice:-0} + ${system:-0} + ${idle:-0} \
    + ${iowait:-0} + ${irq:-0} + ${softirq:-0} + ${steal:-0} ))
  CPU_IDLE=${idle:-0}
}
|
||||
|
||||
# Read the first line of a file into the variable named by $1.
# Arguments: $1 - name of the destination variable
#            $2 - file to read (may be empty)
#            $3 - fallback value (optional, default 0)
# The fallback is used when the path is empty or unreadable, or when the
# content is not an unsigned integer, so the arithmetic below never sees junk
# and no redirect error is printed for a missing file.
read_sysfs_int() {
  local dest=$1 path=$2 fallback=${3:-0} value=""
  if [[ -n "$path" && -r "$path" ]]; then
    read -r value 2>/dev/null < "$path" || true
  fi
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    value=$(( 10#$value ))
  else
    value=$fallback
  fi
  printf -v "$dest" '%s' "$value"
}

# Sampling loop: append one CSV row per iteration until --duration has
# elapsed (or forever when DURATION is 0).
while true; do
  # printf -v formats the current time without forking a subshell.
  printf -v ts '%(%Y-%m-%d %H:%M:%S)T' -1

  # GPU metrics — direct reads, no subshells
  read_sysfs_int gpu_busy "$SYSFS_GPU_BUSY"
  read_sysfs_int vram_bytes "$SYSFS_VRAM_USED"
  read_sysfs_int gtt_bytes "$SYSFS_GTT_USED"
  read_sysfs_int temp_mc "$SYSFS_TEMP"
  read_sysfs_int power_uw "$SYSFS_POWER"

  vram_mib=$(( vram_bytes / 1048576 ))
  gtt_mib=$(( gtt_bytes / 1048576 ))
  # Fixed-point formatting with one decimal: value/1000 and value/1000000.
  gpu_temp_c="$(( temp_mc / 1000 )).$(( (temp_mc % 1000) / 100 ))"
  gpu_power_w="$(( power_uw / 1000000 )).$(( (power_uw % 1000000) / 100000 ))"

  # CPU usage (snapshot delta over a 0.1 s window)
  read_cpu_stat
  prev_total=$CPU_TOTAL
  prev_idle=$CPU_IDLE
  sleep 0.1
  read_cpu_stat
  delta_total=$(( CPU_TOTAL - prev_total ))
  delta_idle=$(( CPU_IDLE - prev_idle ))
  if (( delta_total > 0 )); then
    # Tenths of a percent, so one decimal survives integer arithmetic.
    cpu_pct=$(( (delta_total - delta_idle) * 1000 / delta_total ))
    # Format N as N/10 . N%10, handling single-digit values (e.g., 5 → 0.5)
    cpu_pct_fmt="$(( cpu_pct / 10 )).$(( cpu_pct % 10 ))"
  else
    cpu_pct_fmt="0.0"
  fi

  # RAM used (bash builtins only): MemTotal - MemAvailable, divided by 1024.
  mem_total=0
  mem_avail=0
  while IFS=': ' read -r key val _; do
    case "$key" in
      MemTotal) mem_total=$val ;;
      MemAvailable)
        mem_avail=$val
        break # nothing further is needed from the file
        ;;
    esac
  done < /proc/meminfo
  ram_used_mib=$(( (mem_total - mem_avail) / 1024 ))

  printf '%s\n' "$ts,$gpu_busy,$vram_mib,$gtt_mib,$gpu_temp_c,$gpu_power_w,$cpu_pct_fmt,$ram_used_mib" >> "$OUTPUT"

  # Check duration
  if (( DURATION > 0 && SECONDS - start_time >= DURATION )); then
    break
  fi

  sleep "$INTERVAL"
done
|
||||
Reference in New Issue
Block a user