feat(optimize): add Phase 2 power profile and system tuning
Add `make optimize-power` (ryzenadj 85W, sysctl, THP, RADV nogttspill) with systemd services for boot/resume persistence. Integrate into `make optimize --all` as Phase 2. Update optimization log with RyzenAdj results (+46% tg at 70W sustained), KV sweep data, and quant shootout. Add Qwen3-Coder-30B and Nemotron-Cascade-2 to model catalog.
This commit is contained in:
111
scripts/optimize/power-profile.sh
Normal file
111
scripts/optimize/power-profile.sh
Normal file
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env bash
#
# Apply the power profile and system tuning used for LLM inference workloads.
#
# Must be run as root. Values written to live interfaces are volatile; use
# the systemd service for persistence.

# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Absolute path of the directory holding this script; everything else is
# located relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Shared helper libraries live two directory levels up, under lib/.
# NOTE(review): the log_*, is_cmd and require_root helpers used below are
# presumably defined in these two files — confirm.
. "$SCRIPT_DIR/../../lib/common.sh"
. "$SCRIPT_DIR/../../lib/format.sh"

# Privilege gate before touching any system setting.
require_root
|
||||
|
||||
# ── Power limits via ryzenadj ─────────────────────────────
# Limits handed to ryzenadj, in milliwatts (85000 mW = 85 W).
STAPM=85000
FAST=85000
SLOW=85000
APU_SLOW=85000

if is_cmd ryzenadj; then
    log_header "Power Profile (ryzenadj)"
    log_info "Setting STAPM=${STAPM}mW, Fast=${FAST}mW, Slow=${SLOW}mW, APU=${APU_SLOW}mW"

    # Capture output and exit status separately so that a failing ryzenadj
    # is reported instead of being hidden behind the output filter.
    ryzenadj_status=0
    ryzenadj_output=$(ryzenadj \
        --stapm-limit="$STAPM" \
        --fast-limit="$FAST" \
        --slow-limit="$SLOW" \
        --apu-slow-limit="$APU_SLOW" 2>&1) || ryzenadj_status=$?

    # Show only the per-limit result lines. grep exits 1 when nothing
    # matches, which is not an error here.
    grep -E 'Successfully|Error|not supported' <<< "$ryzenadj_output" || true

    if (( ryzenadj_status != 0 )); then
        log_warn "ryzenadj exited with status ${ryzenadj_status}; limits may not have been applied"
    fi

    # Verify what actually took effect
    log_info "Verifying limits..."
    # Best-effort read-back: a failing or empty info dump must not abort the
    # rest of the tuning under `set -euo pipefail`. `grep -m 8` caps the
    # output at eight lines without a `head` stage (no SIGPIPE exposure).
    ryzenadj_info=$(ryzenadj -i 2>&1) || true
    grep -E -m 8 'LIMIT|VALUE' <<< "$ryzenadj_info" \
        || log_warn "Could not read limits back from 'ryzenadj -i'"
    echo ""
    log_warn "Note: HP firmware may cap PPT SLOW/APU at 70W regardless of setting"
else
    log_error "ryzenadj not found. Install: cd /tmp && git clone https://github.com/FlyGoat/RyzenAdj.git && cd RyzenAdj && mkdir build && cd build && cmake .. && make && sudo cp ryzenadj /usr/local/bin/"
    exit 1
fi
|
||||
|
||||
# ── VM sysctl tuning ──────────────────────────────────────
log_header "VM Sysctl Tuning"

# Desired runtime values, keyed by sysctl name.
declare -A SYSCTLS=(
    [vm.swappiness]=1
    [vm.dirty_ratio]=40
    [vm.dirty_background_ratio]=10
    [vm.max_map_count]=500000
    [vm.zone_reclaim_mode]=0
)

# Apply each value to the running kernel, skipping keys that already match.
# Associative-array iteration order is unspecified, so log order may vary.
for KEY in "${!SYSCTLS[@]}"; do
    VAL="${SYSCTLS[$KEY]}"
    # "?" stands in for a key that cannot be read on this kernel.
    CURRENT=$(sysctl -n "$KEY" 2>/dev/null || echo "?")
    if [[ "$CURRENT" == "$VAL" ]]; then
        log_success "$KEY = $VAL (already set)"
    elif sysctl -w "$KEY=$VAL" > /dev/null 2>&1; then
        log_success "$KEY = $VAL (was $CURRENT)"
    else
        # Report and carry on: with `set -e` an unchecked failure here would
        # end the script without any message, since the output is discarded.
        log_warn "$KEY: could not set to $VAL (current: $CURRENT)"
    fi
done
|
||||
|
||||
# Persist sysctl settings
# The drop-in is written only once; an existing file is left untouched.
SYSCTL_CONF="/etc/sysctl.d/99-llm-inference.conf"
if [[ -f "$SYSCTL_CONF" ]]; then
    log_info "Sysctl config already exists at $SYSCTL_CONF"
else
    log_info "Persisting to $SYSCTL_CONF"
    # One argument per output line; every line is newline-terminated.
    printf '%s\n' \
        '# LLM inference optimizations' \
        'vm.swappiness = 1' \
        'vm.dirty_ratio = 40' \
        'vm.dirty_background_ratio = 10' \
        'vm.max_map_count = 500000' \
        'vm.zone_reclaim_mode = 0' \
        > "$SYSCTL_CONF"
    log_success "Sysctl config saved (persists across reboots)"
fi
|
||||
|
||||
# ── Transparent Huge Pages ────────────────────────────────
log_header "Transparent Huge Pages"

# The active THP mode is the bracketed word in the `enabled` file.
# "unknown" is used when the file cannot be read.
THP_ENABLED=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null || echo "unknown")
if [[ "$THP_ENABLED" == *"[always]"* ]]; then
    log_success "THP = always (already set)"
else
    # Track whether both writes succeed so the log reflects what really
    # happened. The brace groups make 2>/dev/null cover a failing output
    # redirection too (e.g. the sysfs file does not exist).
    thp_applied=1
    { echo always > /sys/kernel/mm/transparent_hugepage/enabled; } 2>/dev/null || thp_applied=0
    { echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag; } 2>/dev/null || thp_applied=0
    if (( thp_applied )); then
        log_success "THP = always, defrag = defer+madvise"
    else
        log_warn "Could not fully apply THP settings (enabled was: $THP_ENABLED)"
    fi
fi
log_info "For persistence, add to kernel cmdline: transparent_hugepage=always"
|
||||
|
||||
# ── RADV nogttspill ───────────────────────────────────────
log_header "Vulkan RADV Environment"

# Environment drop-in carrying the RADV setting. It is created once and
# never overwritten if it already exists.
RADV_CONF="/etc/environment.d/radv-llm.conf"
if [[ -f "$RADV_CONF" ]]; then
    log_success "RADV config already exists at $RADV_CONF"
else
    mkdir -p /etc/environment.d
    printf '%s\n' 'RADV_PERFTEST=nogttspill' > "$RADV_CONF"
    log_success "RADV_PERFTEST=nogttspill persisted to $RADV_CONF"
    log_info "Takes effect on next login. For this session: export RADV_PERFTEST=nogttspill"
fi
|
||||
|
||||
# ── Summary ───────────────────────────────────────────────
log_header "Phase 2 Optimization Summary"

# One success line per tuning area, printed in order.
summary_lines=(
    "Power profile: ryzenadj limits applied (volatile — resets on reboot)"
    "VM tuning: sysctl applied and persisted"
    "THP: enabled (volatile — add to kernel cmdline for persistence)"
    "RADV: nogttspill persisted"
)
for summary_line in "${summary_lines[@]}"; do
    log_success "$summary_line"
done
printf '\n'

# Follow-up commands for making the ryzenadj limits survive a reboot.
persist_hints=(
    "To persist ryzenadj across reboots:"
    " sudo cp $SCRIPT_DIR/../../configs/ryzenadj-llm.service /etc/systemd/system/"
    " sudo systemctl enable --now ryzenadj-llm.service"
)
for persist_hint in "${persist_hints[@]}"; do
    log_info "$persist_hint"
done
|
||||
Reference in New Issue
Block a user