tuned/amdgpu: introduce config file, consolidated script

This commit is contained in:
Josh Lay 2024-08-05 08:12:02 -05:00
parent c08046495c
commit 43ba49770e
No known key found for this signature in database
GPG key ID: 47AA304B2243B579
9 changed files with 193 additions and 198 deletions

View file

@ -40,6 +40,22 @@
notify: Restart tuned
become: true
# Write every defined GPU tunable as a 'key=value' line into the variables file
# that the tuned profiles include; tunables that are not defined are skipped.
# Runs privileged (the file lives under /etc/tuned) and restarts tuned on change,
# matching the other tasks in this file.
- name: Configure profile vars
  ansible.builtin.lineinfile:
    path: /etc/tuned/amdgpu-profile-vars.conf
    line: "{{ item }}={{ vars[item] }}"
    # replace an existing assignment of the same key instead of appending a duplicate
    regexp: "^{{ item }}="
    create: true
    owner: root
    group: root
    mode: "0644"
  when: vars[item] is defined
  loop:
    - gpu_clock_min
    - gpu_clock_max
    - gpumem_clock_static
    - gpu_power_multi_def
    - gpu_power_multi_oc
    - gpu_mv_offset
  notify: Restart tuned
  become: true
- name: Create custom profile directories
ansible.builtin.file:
state: directory
@ -50,27 +66,13 @@
- "{{ base_profiles }}"
become: true
# NOTE(review): this hunk was rendered without its +/- markers, so removed and added
# lines are interleaved below. Judging by the template names, the two looped tasks
# ('Copy ... common ...' and 'Template custom ... scripts') look like the removed side
# and the un-looped 'Template AMDGPU profile script' like the added side - confirm
# against the repository before editing.
- name: Copy 'common' AMDGPU script for all profiles
- name: Template AMDGPU profile script
ansible.builtin.template:
src: profile-common.sh.j2
dest: "{{ (tuned_amdgpu_profile_dir, item.1 + '-amdgpu-' + item.0, 'amdgpu-common.sh') | ansible.builtin.path_join }}"
mode: "0644" # sourced, doesn't require executable bit
owner: root
group: root
notify: Restart tuned
with_nested:
- "{{ amdgpu_profiles }}"
- "{{ base_profiles }}"
become: true
- name: Template custom AMDGPU profile scripts
ansible.builtin.template:
# per-profile template name and per-profile destination directory (item.0 / item.1 come from the loop)
src: amdgpu-profile-{{ item.0 }}.sh.j2
dest: "{{ (tuned_amdgpu_profile_dir, item.1 + '-amdgpu-' + item.0, 'amdgpu-clock.sh') | ansible.builtin.path_join }}"
# single template rendered once into the base profile directory
src: amdgpu-profile.sh.j2
dest: "{{ (tuned_amdgpu_profile_dir, 'amdgpu-clock.sh') | ansible.builtin.path_join }}" # place in base dir for all profiles, called w/ arg
owner: root
group: root
mode: "0755"
loop: "{{ amdgpu_profiles | product(base_profiles) | list }}"
notify: Restart tuned
become: true

View file

@ -1,28 +0,0 @@
#!/bin/bash
{{ ansible_managed | comment }}
# This is the 'default' script; resetting amdgpu clock/power/voltage tunables
#
# AMDGPU driver/sysfs references:
# https://01.org/linuxgraphics/gfx-docs/drm/gpu/amdgpu.html
# https://docs.kernel.org/gpu/amdgpu/thermal.html
#
# Start by sourcing the 'common' script, which determines the card, hwmon directory,
# power profile table and power capability (CARD, HWMON_DIR, PROFILE_MODES, POWER_CAP).
# The command substitution is quoted so a profile directory containing whitespace
# is not word-split before the path reaches '.'.
. "$(dirname "${BASH_SOURCE[0]}")/amdgpu-common.sh"
{# begin the templated script for 'default' profiles to reset state #}
# set control mode back to auto
# attempts to dynamically set optimal power profile for (load) conditions
echo 'auto' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
# reset any existing profile clock changes
echo 'r' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# power limit = board capability (POWER_CAP) x the templated 'default' multiplier, rounded to an integer
POWER_LIM_DEFAULT=$(/usr/bin/awk -v m="$POWER_CAP" -v n="{{ gpu_power_multi.default }}" 'BEGIN {printf "%.0f", (m*n)}')
echo "$POWER_LIM_DEFAULT" | tee "${HWMON_DIR}/power1_cap"
# first column of the BOOTUP_DEFAULT row in the profile table is its ID number
PROF_DEFAULT_NUM=$(/usr/bin/awk '$0 ~ /BOOTUP_DEFAULT.*:/ {print $1}' <<< "$PROFILE_MODES")
# reset power/clock heuristics to that profile
echo "${PROF_DEFAULT_NUM}" | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode

View file

@ -1,50 +0,0 @@
#!/bin/bash
{{ ansible_managed | comment }}
# This is the 'overclock' script; applies amdgpu clock/power/voltage tunables
#
# AMDGPU driver/sysfs references:
# https://01.org/linuxgraphics/gfx-docs/drm/gpu/amdgpu.html
# https://docs.kernel.org/gpu/amdgpu/thermal.html
#
# Start by sourcing the 'common' script, which determines the card, hwmon directory,
# power profile table and power capability (CARD, HWMON_DIR, PROFILE_MODES, POWER_CAP).
# The command substitution is quoted so a profile directory containing whitespace
# is not word-split before the path reaches '.'.
. "$(dirname "${BASH_SOURCE[0]}")/amdgpu-common.sh"
{# begin the templated script for 'overclocked' AMD GPU profiles based on the existing tuned profiles #}
# set the minimum GPU clock - for best performance, this should be near the maximum
# RX6000 series power management *sucks*
echo 's 0 {{ gpu_clock_min }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# set the maximum GPU clock
echo 's 1 {{ gpu_clock_max }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# set the GPU *memory* clock
# normally this would appear disregarded, memory clocked at the minimum allowed by the overdrive (OD) range
# it follows the core clock; if both 0/1 profiles for _it_ are high enough, the memory will follow
echo 'm 1 {{ gpumem_clock_static }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
{% if gpu_mv_offset is defined %}
# offset GPU voltage {{ gpu_mv_offset }}mV
echo 'vo {{ gpu_mv_offset }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
{% endif %}
# commit the changes
echo 'c' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# force GPU core and memory into highest clocks (fix flickering and poor power management)
# set manual control mode
# allows control via 'pp_dpm_mclk', 'pp_dpm_sclk', 'pp_dpm_pcie', 'pp_dpm_fclk', and 'pp_power_profile_mode' files
echo 'manual' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
# power limit = board capability (POWER_CAP) x the templated 'overclock' multiplier, rounded to an integer
POWER_LIM_OC=$(/usr/bin/awk -v m="$POWER_CAP" -v n="{{ gpu_power_multi.overclock }}" 'BEGIN {printf "%.0f", (m*n)}')
echo "$POWER_LIM_OC" | tee "${HWMON_DIR}/power1_cap"
# avoid display flickering, force OC'd memory to highest clock
echo '3' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_mclk
# first column of the VR row in the profile table is its ID number
PROF_VR_NUM=$(/usr/bin/awk '$0 ~ /VR.*:/ {print $1}' <<< "$PROFILE_MODES")
# force 'overclocked' profile to 'VR' power/clock heuristics
# latency/frame timing seemed favorable with relatively-close minimum clocks
echo "${PROF_VR_NUM}" | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode

View file

@ -1,58 +0,0 @@
#!/bin/bash
{{ ansible_managed | comment }}
# This is the 'peak' script; applies most-aggressive amdgpu clock/power/voltage tunables
#
# AMDGPU driver/sysfs references:
# https://01.org/linuxgraphics/gfx-docs/drm/gpu/amdgpu.html
# https://docs.kernel.org/gpu/amdgpu/thermal.html
#
# Start by sourcing the 'common' script, which determines the card, hwmon directory,
# power profile table and power capability (CARD, HWMON_DIR, PROFILE_MODES, POWER_CAP).
# The command substitution is quoted so a profile directory containing whitespace
# is not word-split before the path reaches '.'.
. "$(dirname "${BASH_SOURCE[0]}")/amdgpu-common.sh"
{# begin the templated script for 'overclocked' AMD GPU profiles based on the existing tuned profiles #}
# set the minimum GPU clock - for best performance, this should be near the maximum
# RX6000 series power management *sucks*
echo 's 0 {{ gpu_clock_min }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# set the maximum GPU clock
echo 's 1 {{ gpu_clock_max }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# set the GPU *memory* clock
# normally this would appear disregarded, memory clocked at the minimum allowed by the overdrive (OD) range
# it follows the core clock; if both 0/1 profiles for _it_ are high enough, the memory will follow
echo 'm 1 {{ gpumem_clock_static }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
{% if gpu_mv_offset is defined %}
# offset GPU voltage {{ gpu_mv_offset }}mV
echo 'vo {{ gpu_mv_offset }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
{% endif %}
# commit the changes
echo 'c' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
# force GPU core and memory into highest clocks (fix flickering and poor power management)
# set manual control mode
# allows control via 'pp_dpm_mclk', 'pp_dpm_sclk', 'pp_dpm_pcie', 'pp_dpm_fclk', and 'pp_power_profile_mode' files
echo 'manual' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
# power limit = board capability (POWER_CAP) x the templated 'overclock' multiplier, rounded to an integer
POWER_LIM_OC=$(/usr/bin/awk -v m="$POWER_CAP" -v n="{{ gpu_power_multi.overclock }}" 'BEGIN {printf "%.0f", (m*n)}')
echo "$POWER_LIM_OC" | tee "${HWMON_DIR}/power1_cap"
# pp_dpm_*clk settings are unintuitive, giving profiles that may be used
echo '1' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_sclk
echo '3' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_mclk
echo '2' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_fclk
echo '2' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_socclk
# first column of the VR row in the profile table is its ID number
PROF_VR_NUM=$(/usr/bin/awk '$0 ~ /VR.*:/ {print $1}' <<< "$PROFILE_MODES")
# force 'overclocked' profile to 'VR' power/clock heuristics
# latency/frame timing seemed favorable with relatively-close minimum clocks
echo "${PROF_VR_NUM}" | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode
# note 4/8/2023: instead of 'manual'... try dealing with broken power management, force clocks to high
# ref: https://gitlab.freedesktop.org/drm/amd/-/issues/1500
# followup: doesn't work that well in practice, still flaky on clocks/frame times
#echo 'high' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level

View file

@ -0,0 +1,130 @@
#!/bin/bash
{{ ansible_managed | comment }}
# This is the script for 'amdgpu' profiles, (re)setting clock/power/voltage tunables
#
# AMDGPU driver/sysfs references:
# https://01.org/linuxgraphics/gfx-docs/drm/gpu/amdgpu.html
# https://docs.kernel.org/gpu/amdgpu/thermal.html
#
# Variables shown below named '$TUNED_...' are given values by '/etc/tuned/amdgpu-profile-vars.conf'
#
# Determine the GPU via the DRM subsystem: list the connector status files that report
# 'connected', reduce each path to its 'cardN' name and keep the highest-numbered one.
# FIXME: assumes one card, make configurable
# 'card[0-9]+' plus a version sort keeps multi-digit indices intact: card10 is no longer
# truncated to card1 and sorts after card2.
CARD=$(/usr/bin/grep -ls ^connected /sys/class/drm/*/status | /usr/bin/grep -oE 'card[0-9]+' | /usr/bin/sort -u -V | /usr/bin/tail -1)
# Print the hwmon directory of a DRM card that exposes a 'power1_cap' file.
#   $1 - DRM card name as found under /sys/class/drm (e.g. 'card0')
# Prints the first matching directory and returns 0; prints nothing and returns 1
# when none of the card's hwmon directories contains power1_cap.
function get_hwmon_dir() {
    local card_dir="/sys/class/drm/${1}/device"
    local candidate
    for candidate in "${card_dir}"/hwmon/hwmon*; do
        if [[ -f "${candidate}/power1_cap" ]]; then
            # stop at the first hit so the caller always receives a single path,
            # never a newline-separated list
            echo "${candidate}"
            return 0
        fi
    done
    return 1
}
# hwmon directory of the selected card (the one that exposes power1_cap)
HWMON_DIR="$(get_hwmon_dir "$CARD")"
# snapshot of the driver's power profile table; profile IDs are parsed out of it later
PROFILE_MODES="$(< "/sys/class/drm/${CARD}/device/pp_power_profile_mode")"
# power capability of the board; the power limits applied later are multiples of it
read -r -d '' POWER_CAP < "${HWMON_DIR}/power1_cap_max"
# Reset the GPU tunables: 'auto' performance level, overdrive table reset, the
# default power cap and the BOOTUP_DEFAULT power profile.
# Reads the globals CARD, HWMON_DIR, PROFILE_MODES, POWER_CAP and TUNED_gpu_power_multi_def;
# sets POWER_LIM_DEFAULT and PROF_DEFAULT_NUM.
function amdgpu_profile_reset() {
    # set control mode back to auto
    # attempts to dynamically set optimal power profile for (load) conditions
    echo 'auto' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
    # reset any existing profile clock changes
    echo 'r' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
    # adjust power limit using multiplier against board capability
    # the multiplier may be absent from the vars file; without this check awk would
    # multiply by an empty value and a power cap of 0 would be written
    if [[ -n ${TUNED_gpu_power_multi_def} ]]; then
        POWER_LIM_DEFAULT=$(/usr/bin/awk -v m="$POWER_CAP" -v n="${TUNED_gpu_power_multi_def}" 'BEGIN {printf "%.0f", (m*n)}')
        echo "$POWER_LIM_DEFAULT" | tee "${HWMON_DIR}/power1_cap"
    else
        echo 'TUNED_gpu_power_multi_def is not set; leaving power1_cap unchanged' >&2
    fi
    # first column of the BOOTUP_DEFAULT row in the profile table is its ID number
    PROF_DEFAULT_NUM=$(/usr/bin/awk '$0 ~ /BOOTUP_DEFAULT.*:/ {print $1}' <<< "$PROFILE_MODES")
    # reset power/clock heuristics to that profile; skip the write if no ID was found
    if [[ -n ${PROF_DEFAULT_NUM} ]]; then
        echo "${PROF_DEFAULT_NUM}" | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode
    else
        echo 'no BOOTUP_DEFAULT entry in pp_power_profile_mode; leaving power profile unchanged' >&2
    fi
    # delay before returning - have mercy, may be followed with other profile function calls
    sleep 0.5
}
# Apply the overclock settings: core clock range, memory clock, optional voltage offset,
# 'manual' performance level, raised power cap, highest memory state and the VR power profile.
# Reads the globals CARD, HWMON_DIR, PROFILE_MODES, POWER_CAP and the TUNED_gpu_* variables;
# sets POWER_LIM_OC and PROF_VR_NUM.
function amdgpu_profile_overclock() {
    # 's 0' / 's 1' stage the minimum / maximum GPU clock, 'm 1' the memory clock
    echo "s 0 ${TUNED_gpu_clock_min}" | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
    echo "s 1 ${TUNED_gpu_clock_max}" | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
    echo "m 1 ${TUNED_gpumem_clock_static}" | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
    # under/over-voltage is considered optional or less likely to be defined, checked before use
    if [[ -n ${TUNED_gpu_mv_offset} ]]; then
        echo "vo ${TUNED_gpu_mv_offset}" | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
    fi
    # commit the changes
    echo 'c' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
    # force GPU core and memory into highest clocks (fix flickering and poor power management)
    # set manual control mode
    # allows control via 'pp_dpm_mclk', 'pp_dpm_sclk', 'pp_dpm_pcie', 'pp_dpm_fclk', and 'pp_power_profile_mode' files
    echo 'manual' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
    # adjust power limit using multiplier against board capability
    # the multiplier may be absent from the vars file; without this check awk would
    # multiply by an empty value and a power cap of 0 would be written
    if [[ -n ${TUNED_gpu_power_multi_oc} ]]; then
        POWER_LIM_OC=$(/usr/bin/awk -v m="$POWER_CAP" -v n="${TUNED_gpu_power_multi_oc}" 'BEGIN {printf "%.0f", (m*n)}')
        echo "$POWER_LIM_OC" | tee "${HWMON_DIR}/power1_cap"
    else
        echo 'TUNED_gpu_power_multi_oc is not set; leaving power1_cap unchanged' >&2
    fi
    # avoid display flickering, force OC'd memory to highest clock
    echo '3' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_mclk
    # first column of the VR row in the profile table is its ID number
    PROF_VR_NUM=$(/usr/bin/awk '$0 ~ /VR.*:/ {print $1}' <<< "$PROFILE_MODES")
    # force 'overclocked' profile to 'VR' power/clock heuristics
    # latency/frame timing seemed favorable with relatively-close minimum clocks
    # skip the write if no VR profile ID was found
    if [[ -n ${PROF_VR_NUM} ]]; then
        echo "${PROF_VR_NUM}" | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode
    else
        echo 'no VR entry in pp_power_profile_mode; leaving power profile unchanged' >&2
    fi
    # delay before returning - have mercy, may be followed with other profile function calls
    sleep 0.5
}
# Print the usage line on stdout and terminate the script with exit status 1.
function amdgpu_profile_help() {
    printf '%s\n' "Usage: $0 {reset|overclock|peak}"
    exit 1
}
# without any argument: show the usage text (which exits)
[[ -n "$1" ]] || amdgpu_profile_help
# 'tuned' has trouble running scripts with args - treats '/path/to/script.sh arg1' as one complete item
# so the function(s) to run are chosen from the name of the active profile instead
read -r -d '' TUNED_PROFILE < /etc/tuned/active_profile
case "$TUNED_PROFILE" in
    *-amdgpu-default)
        amdgpu_profile_reset
        ;;
    *-amdgpu-overclock | *-amdgpu-peak)
        # both start from a clean state, then apply the overclock
        amdgpu_profile_reset
        amdgpu_profile_overclock
        if [[ "$TUNED_PROFILE" == *-amdgpu-peak ]]; then
            # 'peak' additionally writes a fixed level to each clock table
            for setting in pp_dpm_sclk=1 pp_dpm_mclk=3 pp_dpm_fclk=2 pp_dpm_socclk=2; do
                echo "${setting#*=}" | tee /sys/class/drm/"${CARD}"/device/"${setting%%=*}"
            done
            # if I understand correctly, 'power_dpm_force_performance_level' at 'profile_peak' achieves similar... but precludes some control
        fi
        ;;
    *)
        # active profile is not one of the amdgpu profiles
        amdgpu_profile_help
        ;;
esac

View file

@ -1,31 +0,0 @@
#!/bin/bash
{{ ansible_managed | comment }}
# 'common' file sourced by other scripts under tuned profile
#
# Dynamically determine the GPU via the DRM subsystem: list the connector status files
# that report 'connected', reduce each path to its 'cardN' name and keep the highest-numbered one.
# 'card[0-9]+' plus a version sort keeps multi-digit indices intact: card10 is no longer
# truncated to card1 and sorts after card2.
CARD=$(/usr/bin/grep -ls ^connected /sys/class/drm/*/status | /usr/bin/grep -oE 'card[0-9]+' | /usr/bin/sort -u -V | /usr/bin/tail -1)
# Print the hwmon directory of a DRM card that exposes a 'power1_cap' file.
#   $1 - DRM card name as found under /sys/class/drm (e.g. 'card0')
# Prints the first matching directory and returns 0; prints nothing and returns 1
# when none of the card's hwmon directories contains power1_cap.
function get_hwmon_dir() {
    local card_dir="/sys/class/drm/${1}/device"
    local candidate
    for candidate in "${card_dir}"/hwmon/hwmon*; do
        if [[ -f "${candidate}/power1_cap" ]]; then
            # stop at the first hit so the caller always receives a single path
            echo "${candidate}"
            return 0
        fi
    done
    return 1
}
# determine the hwmon directory
HWMON_DIR=$(get_hwmon_dir "${CARD}")
# read all of the power profiles, used to get the IDs for assignment later
PROFILE_MODES=$(< /sys/class/drm/"${CARD}"/device/pp_power_profile_mode)
# get power capability; later used to determine limits
read -r -d '' POWER_CAP < "$HWMON_DIR"/power1_cap_max
# export determinations
export CARD
export HWMON_DIR
export PROFILE_MODES
export POWER_CAP

View file

@ -4,12 +4,16 @@
include={{ item.1 }}
summary={{ item.1 }} + TCP/RAID tweaks + AMDGPU {{ item.0 }}
[variables]
include=/etc/tuned/amdgpu-profile-vars.conf
# reference/execute AMDGPU control script
# used because some sysfs interfaces are transactional
# cannot be set by a single param in 'tuned'/[sysfs]
[gpuclockscript]
type=script
script=${i:PROFILE_DIR}/amdgpu-clock.sh
script={{ (tuned_amdgpu_profile_dir, 'amdgpu-clock.sh') | ansible.builtin.path_join }}
{# the state-managing script is given no profile argument here; it picks its behaviour from the active profile name in /etc/tuned/active_profile #}
{% if plugins is defined %}
{% for section, options in plugins.items() %}
{#+ give each plugin section some space +#}