Initial commit
This commit is contained in:
127
scripts/monitor/log-metrics.sh
Normal file
127
scripts/monitor/log-metrics.sh
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env bash
|
||||
# Background metric collector — samples GPU and system stats to CSV
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
source "$SCRIPT_DIR/../../lib/common.sh"
|
||||
source "$SCRIPT_DIR/../../lib/detect.sh"
|
||||
|
||||
INTERVAL=2   # seconds to sleep between samples
OUTPUT=""    # CSV destination; empty = not specified on the command line
DURATION=0   # total run time in seconds; 0 = indefinite

#######################################
# Parse command-line options into INTERVAL, OUTPUT and DURATION.
# Globals:   INTERVAL, OUTPUT, DURATION (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on --help; diagnostics through log_warn / log_error
# Returns:   0 on success. Exits 0 after --help. Exits 1 when an option
#            that takes a value is the last word on the command line.
#######################################
log_metrics_parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --output|-o|--interval|-i|--duration|-d)
        # Without this check a trailing "-o" dies on "$2" under `set -u`
        # with an unhelpful "unbound variable" message.
        if (( $# < 2 )); then
          log_error "Option $1 requires a value"
          exit 1
        fi
        case "$1" in
          --output|-o)   OUTPUT="$2" ;;
          --interval|-i) INTERVAL="$2" ;;
          --duration|-d) DURATION="$2" ;;
        esac
        shift 2
        ;;
      --help|-h)
        echo "Usage: log-metrics.sh [--output FILE] [--interval SECS] [--duration SECS]"
        exit 0
        ;;
      *)
        # Unknown words are reported but do not abort the run.
        log_warn "Unknown argument: $1"
        shift
        ;;
    esac
  done
}

#######################################
# Validate and normalise the numeric settings.
# Globals:   INTERVAL, DURATION (read, rewritten in canonical base 10)
# Outputs:   error message through log_error on invalid input
# Returns:   0 on success; exits 1 if either value is not a string of digits
#######################################
log_metrics_validate_args() {
  # Zero is accepted for both values (DURATION=0 means "run indefinitely"),
  # so the messages say "non-negative" rather than "positive".
  if [[ ! "$INTERVAL" =~ ^[0-9]+$ ]]; then
    log_error "--interval must be a non-negative integer"
    exit 1
  fi
  if [[ ! "$DURATION" =~ ^[0-9]+$ ]]; then
    log_error "--duration must be a non-negative integer"
    exit 1
  fi

  # Force base 10: a value such as "08" passes the regex above but would be
  # rejected as invalid octal by later arithmetic on these variables.
  INTERVAL=$(( 10#$INTERVAL ))
  DURATION=$(( 10#$DURATION ))
}

log_metrics_parse_args "$@"
log_metrics_validate_args
# No explicit --output: default to metrics-<timestamp>.csv under whatever
# `data_dir logs` returns (data_dir and timestamp come from the sourced libs).
[[ -n "$OUTPUT" ]] || OUTPUT="$(data_dir logs)/metrics-$(timestamp).csv"

# Make sure the destination directory exists before anything is written.
mkdir -p "$(dirname "$OUTPUT")"

# Resolve every sysfs path a single time up front so the sampling loop never
# has to expand a glob.
SYSFS_GPU_BUSY="$GPU_SYSFS/gpu_busy_percent"
SYSFS_VRAM_USED="$GPU_SYSFS/mem_info_vram_used"
SYSFS_GTT_USED="$GPU_SYSFS/mem_info_gtt_used"

# The hwmon index is not fixed, so take the first hwmon* directory that has
# the attribute. Each variable stays empty when nothing matches.
SYSFS_TEMP=""
for candidate in "$GPU_SYSFS"/hwmon/hwmon*/temp1_input; do
  if [[ -f "$candidate" ]]; then
    SYSFS_TEMP="$candidate"
    break
  fi
done

SYSFS_POWER=""
for candidate in "$GPU_SYSFS"/hwmon/hwmon*/power1_average; do
  if [[ -f "$candidate" ]]; then
    SYSFS_POWER="$candidate"
    break
  fi
done

# Start the CSV with its column header (this truncates an existing file).
printf '%s\n' "timestamp,gpu_busy_pct,vram_used_mib,gtt_used_mib,gpu_temp_c,gpu_power_w,cpu_pct,ram_used_mib" > "$OUTPUT"

log_info "Logging metrics every ${INTERVAL}s → $OUTPUT"
if [[ $DURATION -gt 0 ]]; then
  log_info "Will stop after ${DURATION}s"
fi

# Baseline for the elapsed-time check in the sampling loop.
start_time=$SECONDS
# Flag consulted by cleanup() so its summary is logged only once.
stopped=false

#######################################
# Log a one-line summary of how many samples were written. Safe to call
# more than once: only the first call logs anything.
# Globals:   stopped (read/written), OUTPUT (read)
# Arguments: none
# Outputs:   summary line through log_info
# Returns:   0
#######################################
cleanup() {
  # Compare as a string instead of executing the variable's content.
  if [[ "${stopped:-false}" == true ]]; then
    return 0
  fi
  stopped=true

  local total=0
  local samples=0
  # The file may have been removed while the logger was running; in that
  # case report zero samples instead of emitting a redirect error.
  if [[ -r "${OUTPUT:-}" ]]; then
    total=$(wc -l < "$OUTPUT") || total=0
  fi
  # One line is the CSV header; clamp so an empty file does not report -1.
  samples=$(( total > 0 ? total - 1 : 0 ))

  log_info "Metric logger stopped. $samples samples in ${OUTPUT:-}"
  return 0
}
trap cleanup EXIT

#######################################
# Read the aggregate "cpu" line (the first line) of a /proc/stat-style file
# using bash builtins only.
# Globals:   CPU_TOTAL (written) - sum of user, nice, system, idle, iowait,
#                                  irq, softirq and steal
#            CPU_IDLE  (written) - the idle column only (iowait is not
#                                  counted as idle)
# Arguments: $1 - file to read (optional, default /proc/stat)
# Returns:   0 on success, 1 if the first line cannot be read
#######################################
read_cpu_stat() {
  local stat_file="${1:-/proc/stat}"
  local label user nice system idle iowait irq softirq steal rest

  # Line layout: "cpu user nice system idle iowait irq softirq steal ..."
  # Reading straight into named variables avoids unquoted word splitting and
  # leaves columns that are absent from the line empty.
  read -r label user nice system idle iowait irq softirq steal rest \
    < "$stat_file" || return 1

  # Inside $(( )) an empty variable evaluates to 0, so a short line is
  # summed without tripping `set -u`.
  CPU_TOTAL=$(( user + nice + system + idle + iowait + irq + softirq + steal ))
  CPU_IDLE=$(( idle ))
}

#######################################
# Read the first line of a sysfs attribute into a named variable without
# forking. The variable is set to 0 when the path is empty, the file cannot
# be opened or read, or the content is not a plain integer.
# Arguments: $1 - name of the variable to assign
#            $2 - path of the file to read (may be empty)
# Returns:   0
#######################################
read_sysfs_int() {
  local raw=""
  if [[ -n "$2" ]]; then
    # Redirections are applied left to right, so `read v < file 2>/dev/null`
    # still prints the shell's "No such file" error when the open fails.
    # Wrapping the read in a group silences stderr before the file is opened.
    { read -r raw < "$2"; } 2>/dev/null || raw=""
  fi
  # Anything that is not an integer would break the arithmetic below.
  [[ "$raw" =~ ^-?[0-9]+$ ]] || raw=0
  printf -v "$1" '%s' "$raw"
}

# Sampling loop: append one CSV row per iteration until DURATION seconds have
# elapsed (when DURATION > 0) or the script is terminated.
while true; do
  # printf -v formats the current time without a command substitution.
  printf -v ts '%(%Y-%m-%d %H:%M:%S)T' -1

  # GPU metrics: direct reads of the paths cached before the loop
  read_sysfs_int gpu_busy "$SYSFS_GPU_BUSY"
  read_sysfs_int vram_bytes "$SYSFS_VRAM_USED"
  read_sysfs_int gtt_bytes "$SYSFS_GTT_USED"
  read_sysfs_int temp_mc "$SYSFS_TEMP"
  read_sysfs_int power_uw "$SYSFS_POWER"

  vram_mib=$(( vram_bytes / 1048576 ))
  gtt_mib=$(( gtt_bytes / 1048576 ))
  # Integer-only formatting to one decimal place: whole part, ".", tenths.
  gpu_temp_c="$(( temp_mc / 1000 )).$(( (temp_mc % 1000) / 100 ))"
  gpu_power_w="$(( power_uw / 1000000 )).$(( (power_uw % 1000000) / 100000 ))"

  # CPU usage: difference between two snapshots taken 0.1 s apart
  read_cpu_stat
  prev_total=$CPU_TOTAL
  prev_idle=$CPU_IDLE
  sleep 0.1
  read_cpu_stat
  delta_total=$(( CPU_TOTAL - prev_total ))
  delta_idle=$(( CPU_IDLE - prev_idle ))
  if (( delta_total > 0 )); then
    # cpu_pct is in tenths of a percent, e.g. 5 -> "0.5", 1000 -> "100.0".
    cpu_pct=$(( (delta_total - delta_idle) * 1000 / delta_total ))
    cpu_pct_fmt="$(( cpu_pct / 10 )).$(( cpu_pct % 10 ))"
  else
    # No counter movement between the snapshots: avoid dividing by zero.
    cpu_pct_fmt="0.0"
  fi

  # RAM used = MemTotal - MemAvailable, parsed with bash builtins only.
  # /proc/meminfo reports these values in kB, hence the / 1024 for MiB.
  mem_total_kib=0
  mem_avail_kib=0
  while IFS=': ' read -r key val _; do
    case "$key" in
      MemTotal) mem_total_kib=$val ;;
      # Stop scanning once MemAvailable has been seen.
      MemAvailable) mem_avail_kib=$val; break ;;
    esac
  done < /proc/meminfo
  ram_used_mib=$(( (mem_total_kib - mem_avail_kib) / 1024 ))

  printf '%s\n' \
    "$ts,$gpu_busy,$vram_mib,$gtt_mib,$gpu_temp_c,$gpu_power_w,$cpu_pct_fmt,$ram_used_mib" \
    >> "$OUTPUT"

  # Stop once the requested run time has elapsed (DURATION=0 never stops).
  if (( DURATION > 0 && SECONDS - start_time >= DURATION )); then
    break
  fi

  sleep "$INTERVAL"
done
Reference in New Issue
Block a user