From f780ca20eee43938687414e808498fea9e673dee Mon Sep 17 00:00:00 2001 From: Josh Lay Date: Sun, 12 Jun 2022 21:20:10 -0500 Subject: [PATCH] use (one source template) script for *all* GPU control --- playbook.yml | 26 +++++++++--------- templates/amdgpu-clock-reset.sh.j2 | 12 --------- templates/amdgpu-clock.sh.j2 | 42 +++++++++++++++++++++++++----- templates/tuned.conf.j2 | 30 +++------------------ 4 files changed, 50 insertions(+), 60 deletions(-) delete mode 100644 templates/amdgpu-clock-reset.sh.j2 diff --git a/playbook.yml b/playbook.yml index 155d386..6b41cef 100644 --- a/playbook.yml +++ b/playbook.yml @@ -60,6 +60,15 @@ patterns: - '^power1_cap_max$' register: hwmon + - name: find hwmon/current power limit file for {{ card }} + find: + paths: /sys/class/drm/{{ card }}/device/hwmon + file_type: file + recurse: yes + use_regex: yes + patterns: + - '^power1_cap$' + register: powercap_set - name: get max power capability for {{ card }} slurp: src: "{{ hwmon.files.0.path }}" @@ -72,26 +81,15 @@ with_nested: - "{{ amdgpu_profiles }}" - "{{ base_profiles }}" - - name: template AMDGPU clock control scripts (tuned profile dependency) + - name: template AMDGPU clock control scripts (default/GPU profile dependency) template: src: templates/amdgpu-clock.sh.j2 - dest: /etc/tuned/{{ item.1 }}-amdgpu-{{ item.0 }}/amdgpu-clock.sh + dest: /etc/tuned/{{ item.1 }}-amdgpu-{{ item.0.name }}/amdgpu-clock.sh owner: root group: root mode: "0755" with_nested: - - 'custom' - - "{{ base_profiles }}" - notify: restart tuned - - name: template AMDGPU clock control *reset* script (tuned profile dependency) - template: - src: templates/amdgpu-clock-reset.sh.j2 - dest: /etc/tuned/{{ item.1 }}-amdgpu-{{ item.0 }}/amdgpu-clock-reset.sh - owner: root - group: root - mode: "0755" - with_nested: - - 'default' + - "{{ amdgpu_profiles }}" - "{{ base_profiles }}" notify: restart tuned - name: template custom tuned profiles diff --git a/templates/amdgpu-clock-reset.sh.j2 
b/templates/amdgpu-clock-reset.sh.j2 deleted file mode 100644 index 5e9b34b..0000000 --- a/templates/amdgpu-clock-reset.sh.j2 +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# script to reset tuned's AMDGPU clock control to default -# -# rendered by Ansible with environment-appropriate values: -# card #, eg: card0 -# min/max GPU clocks - -# set control mode back to auto -echo 'auto' | tee /sys/class/drm/{{ card }}/device/power_dpm_force_performance_level - -# reset any existing profile clock changes -echo 'r' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage diff --git a/templates/amdgpu-clock.sh.j2 b/templates/amdgpu-clock.sh.j2 index 3ffc652..8391e46 100644 --- a/templates/amdgpu-clock.sh.j2 +++ b/templates/amdgpu-clock.sh.j2 @@ -1,19 +1,46 @@ #!/bin/bash # script for tuned AMDGPU clock control -# clocks in 3D usage are dynamic based on need/usage -# -# this sets the minimums / maximums +# configures GPU power/clock characteristics +# clocks/power in 3D are dynamic based on need/usage # # rendered by Ansible with environment-appropriate values: # card #, eg: card0 -# min/max GPU clocks +# path to discovered sysfs device files (power/clock/voltage control) +# +# this sets the minimums / maximums for a specific generation of GPU +# settings may need to be adjusted +# +# AMDGPU driver/sysfs references: +# https://01.org/linuxgraphics/gfx-docs/drm/gpu/amdgpu.html +# https://docs.kernel.org/gpu/amdgpu/thermal.html -# send a reset for a clean slate -# echo 'r' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage +{% if 'default' in item.0.name %} +# set power state transition heuristics to default +echo '{{ item.0.value }}' | tee /sys/class/drm/{{ card }}/device/pp_power_profile_mode +# set control mode back to auto +# attempts to dynamically set optimal power profile for conditions +echo 'auto' | tee /sys/class/drm/{{ card }}/device/power_dpm_force_performance_level + +# reset any existing profile clock changes +echo 'r' | tee /sys/class/drm/{{ card 
}}/device/pp_od_clk_voltage + +# give default profile {{ (power_max_default_multi * 100.0) | round | int }}% of the max power capability +# {{ power_cap_default|int/1000000 }} Watts of {{ power_max|int/1000000 }} total +echo '{{ power_cap_default }}' | tee {{ powercap_set.files.0.path }} +{% elif 'custom' in item.0.name %} # set manual control mode +# allow control via 'pp_dpm_mclk', 'pp_dpm_sclk', 'pp_dpm_pcie', and 'pp_power_profile_mode' files echo 'manual' | tee /sys/class/drm/{{ card }}/device/power_dpm_force_performance_level +# set power state transition heuristics to custom/manual +# looked up from amdgpu_profiles variable using 'with_nested' loop in task +echo '{{ item.0.value }}' | tee /sys/class/drm/{{ card }}/device/pp_power_profile_mode + +# give this profile {{ (power_max_custom_multi * 100.0) | round | int }}% of the max power capability +# {{ power_cap_custom|int/1000000 }} Watts of {{ power_max|int/1000000 }} total +echo '{{ power_cap_custom }}' | tee {{ powercap_set.files.0.path }} + # set the minimum GPU clock echo 's 0 {{ gpu_clock_min }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage @@ -22,8 +49,8 @@ echo 's 1 {{ gpu_clock_max }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_ # set the maximum GPU *memory* clock echo 'm 1 {{ gpumem_clock_max }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage - {% if gpu_mv_offset is defined %} + # offset GPU voltage {{ gpu_mv_offset }}mV echo 'vo {{ gpu_mv_offset }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage {% endif %} @@ -33,3 +60,4 @@ echo 'c' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage # force GPU memory into highest profile echo '3' | tee /sys/class/drm/{{ card }}/device/pp_dpm_mclk +{% endif %} diff --git a/templates/tuned.conf.j2 b/templates/tuned.conf.j2 index 6ebe1c2..8e3e9ca 100644 --- a/templates/tuned.conf.j2 +++ b/templates/tuned.conf.j2 @@ -14,33 +14,9 @@ dev.raid.speed_limit_max=9000000 # allow some games to run (eg: DayZ) vm.max_map_count=1048576 
-{% if 'default' 
in item.0.name %} -# reference/execute AMDGPU clock control *reset* script -[gpuresetscript] -type=script -script=${i:PROFILE_DIR}/amdgpu-clock-reset.sh - -[sysfs] -# configure GPU power/clock characteristics -# ref: https://docs.kernel.org/gpu/amdgpu/thermal.html -/sys/class/drm/{{ card }}/device/pp_power_profile_mode = {{ item.0.value }} -/sys/class/drm/{{ card }}/device/power_dpm_force_performance_level = auto -{# # give default profile {{ power_max_default_multi * 100.0 |int }}% of the max power capability -- {{ power_max_float * power_max_default_multi / 1000000.0 |int }} Watts of {{ power_max / 1000000 |int }} total #} -# give default profile {{ power_max_default_multi * 100.0 |int }}% of the max power capability -# {{ power_cap_default|int/1000000 }} Watts of {{ power_max|int/1000000 }} total -/sys/class/drm/{{ card }}/device/hwmon/hwmon9/power1_cap = {{ power_cap_default }} -{% endif %} -{% if 'custom' in item.0.name %} -[sysfs] -# configure GPU power/clock characteristics -# ref: https://docs.kernel.org/gpu/amdgpu/thermal.html -/sys/class/drm/{{ card }}/device/pp_power_profile_mode = {{ item.0.value }} -# give this custom oriented profile {{ power_max_custom_multi * 100.0 |int }}% of the max power capability -# {{ power_cap_custom|int/1000000 }} Watts of {{ power_max|int/1000000 }} total -/sys/class/drm/{{ card }}/device/hwmon/hwmon9/power1_cap = {{ power_cap_custom }} - -# reference/execute AMDGPU clock control script +# reference/execute AMDGPU control script +# used because some sysfs interfaces are transactional +# cannot be set by a single param in 'tuned'/[sysfs] [gpuclockscript] type=script script=${i:PROFILE_DIR}/amdgpu-clock.sh -{% endif %}