diff --git a/host_vars/localhost.yml b/host_vars/localhost.yml
new file mode 100644
index 0000000..f753b94
--- /dev/null
+++ b/host_vars/localhost.yml
@@ -0,0 +1,52 @@
+---
+# statically defined mapping of the contents in /sys/class/drm/card*/device/pp_power_profile_mode
+# more may be added, but do not remove default/custom. new profiles require a script template, see 'templates'
+amdgpu_profiles:
+  default:
+    pwrmode: 0
+  3D:
+    pwrmode: 1
+  VR:
+    pwrmode: 4
+  custom:
+    pwrmode: 6
+
+# the multipliers against power capability to determine power limits for the non-OC (default)/OC (custom) profiles
+# 0.5 = 50%, 1.0 = 100% (of card power capability, not stock limits)
+#
+# should not exceed 1.0, must be a float. driver will do some rounding/stepping
+#
+# see 'power_max multi tab calculator.ods' for a calculator/spreadsheet
+# adjust cell F14 (board max power) and the 'effective watts' column to update calculations
+# microWatt board power capability can be discovered like so: 'cat /sys/class/drm/card*/device/hwmon/hwmon*/power1_cap_max'
+# power_max_default_multi: 0.75  # 242.25W, slightly lower than true default
+# power for the default profile
+power_max_default_multi: 0.789473684210526  # 255W - default
+# power_max_default_multi: 0.820433436532508  # 265W
+# power_max_default_multi: 0.851393188854489  # 275W
+# power_max_default_multi: 0.869969040247678  # 281W
+
+# power for the custom profile
+# power_max_custom_multi: 0.789473684210526  # 255W - default
+power_max_custom_multi: 0.869969040247678  # 281W
+# alt default power limits
+# power_max_default_multi: 0.696594427244582  # 225W
+# power_max_default_multi: 0.869969040247678  # 281W
+#
+# minimum/maximum GPU clocks using 'powerplay' below
+# these do *not* apply to the resulting 'amdgpu-default' tuned profile
+#
+# only the others (eg: 'VR'/'custom') under amdgpu_profiles (above)
+# effective clocks are dynamically adjusted by the driver in this range - based on utilization
+# can find baseline values in the '/sys/class/drm/card*/device/pp_od_clk_voltage' file
+# OD_RANGE indicates acceptable value ranges for SCLK (core) and MCLK (memory)
+# Under 'OD_SCLK' and 'OD_MCLK' you will see 0/1. These are the minimum/maximum values for the respective component.
+gpu_clock_min: "500"  # default 500
+gpu_clock_max: "2500"  # default 2529
+#
+# memory clocks are static, we only set a 'max' - high-refresh rate display quirk
+# this allows the memory clock to be adjusted
+gpumem_clock_max: "1050"
+#
+# optional, applies offset to GPU voltage, eg: '+100' = to boost GPU core voltage 100mV or 0.1V. for the 'custom' GPU profile.
+# gpu_mv_offset: "+50"
diff --git a/inventories/localhost.yml b/inventories/localhost.yml
new file mode 100644
index 0000000..16c7496
--- /dev/null
+++ b/inventories/localhost.yml
@@ -0,0 +1,5 @@
+---
+lab:
+  hosts:
+    localhost:
+      ansible_connection: local
diff --git a/playbook.yml b/playbook.yml
index c7e96a6..47f3e99 100644
--- a/playbook.yml
+++ b/playbook.yml
@@ -1,36 +1,7 @@
 ---
 - hosts: localhost
-  become: yes
+  become: true
   vars:
-    # the multipliers against power capability to determine power limits for the non-OC (default)/OC (custom) profiles
-    # 0.5 = 50%
-    # 1.0 = 100% (of card power capability, not stock limits)
-    # should not exceed 1.0, must be a float. driver will do some rounding/stepping
-    # default is ~87% (281/323) on my 6900XT. check '/sys/class/drm/card*/device/hwmon/hwmon*/power1_cap_default'
-    power_max_custom_multi: 0.928793 # calculated to ~300W. used to control the effective power limit in the non-default AMDGPU tuned profiles
-    power_max_default_multi: 0.82 # (typically) used to limit GPU power to some lower percentage on default perf mode/profile
-    #
-    # minimum/maximum GPU clocks using 'powerplay'
-    # these do not apply to the resulting 'amdgpu-default' tuned profile
-    # only the others (eg: 'VR'/'custom') under amdgpu_profiles (below)
-    # effective clocks are dynamically adjusted by the driver in this range - based on utilization
-    # can find baseline values in the '/sys/class/drm/{{ card }}/device/pp_od_clk_voltage' file
-    # OD_RANGE indicates acceptable value ranges for SCLK (core) and MCLK (memory)
-    # Under 'OD_SCLK' and 'OD_MCLK' you will see 0/1. These are the minimum/maximum values for the respective component.
-    gpu_clock_min: "2000"
-    gpu_clock_max: "2615"
-    #
-    # memory clocks are static, we only set a 'max' - high-refresh rate display quirk
-    # this allows the memory clock to be adjusted
-    gpumem_clock_max: "1075"
-    #
-    # optional, applies offset to GPU voltage, eg: '+100' = to boost GPU core voltage 100mV or 0.1V. for the 'custom' GPU profile.
-    gpu_mv_offset: "-25"
-    #
-    # the card for 'tuned' to control
-    # this is *usually* 'card0', but may differ.
-    # check '/dev/dri/card*'
-    card: 'card0' # default to card0
     # list of source tuned profiles available on Fedora (TODO: should dynamically discover)
     # further modified with AMD GPU power/clock parameters, creating new profiles. eg: 'balanced-amdgpu-VR'
     base_profiles:
@@ -41,13 +12,6 @@
       - network-throughput
       - powersave
       - virtual-host
-    amdgpu_profiles: # statically defined mapping of the contents in /sys/class/drm/card*/device/pp_power_profile_mode
-      default: # more may be added, but do not remove default/custom. new profiles require a script template, see 'templates'
-        pwrmode: 0
-      VR:
-        pwrmode: 4
-      custom:
-        pwrmode: 6
   handlers:
     - name: restart tuned
       ansible.builtin.service:
@@ -64,7 +28,10 @@
       with_items:
         - {name: 'power-profiles-daemon', state: 'absent'}
         - {name: 'tuned', state: 'present'}
-      when: ('power-profiles-daemon' in ansible_facts.packages) or (('tuned' not in ansible_facts.packages) and ((ansible_distribution == 'Fedora') and (ansible_distribution_major_version|int > 35)))
+      when:
+        - ('power-profiles-daemon' in ansible_facts.packages) or ('tuned' not in ansible_facts.packages)
+        - ansible_distribution == 'Fedora'
+        - ansible_distribution_major_version|int > 35
       register: fed_ppdtuned_swap
     # 'power-profiles-daemon' was added/conflicts with 'tuned' since F35
     # otherwise, ensure the 'tuned' package is installed
@@ -73,25 +40,34 @@
         name: tuned
         state: present
       when: (fed_ppdtuned_swap is not defined) or ('tuned' not in ansible_facts.packages)
-    - name: find hwmon/max power capability file for {{ card }}
+    # detect the card at run time instead of hard-coding it; the pattern is quoted and matches multi-digit indexes (card10)
+    - name: determine GPU device in drm subsystem
+      shell:
+        cmd: grep -ls '^connected' /sys/class/drm/*/status | grep -oE 'card[0-9]+' | sort -uV | tail -1
+        executable: /bin/bash
+      changed_when: false
+      # fail here when nothing is connected, rather than later on an empty 'hwmon.files' lookup
+      failed_when: card.stdout | length == 0
+      register: card
+    - name: find hwmon/max power capability file for {{ card.stdout }}
       find:
-        paths: /sys/class/drm/{{ card }}/device/hwmon
+        paths: /sys/class/drm/{{ card.stdout }}/device/hwmon
         file_type: file
-        recurse: yes
-        use_regex: yes
+        recurse: true
+        use_regex: true
         patterns:
           - '^power1_cap_max$'
       register: hwmon
-    - name: find hwmon/current power limit file for {{ card }}
+    - name: find hwmon/current power limit file for {{ card.stdout }}
       find:
-        paths: /sys/class/drm/{{ card }}/device/hwmon
+        paths: /sys/class/drm/{{ card.stdout }}/device/hwmon
         file_type: file
-        recurse: yes
-        use_regex: yes
+        recurse: true
+        use_regex: true
         patterns:
           - '^power1_cap$'
       register: powercap_set
-    - name: get max power capability for {{ card }}
+    - name: get max power capability for {{ card.stdout }}
       slurp:
         src: "{{ hwmon.files.0.path }}"
       register: power_max_b64
@@ -128,4 +104,4 @@
     - name: ensure tuned is enabled
       service:
         name: tuned
-        enabled: yes
+        enabled: true
diff --git a/templates/amdgpu-clock.sh.j2 b/templates/amdgpu-clock.sh.j2
index 2f5ec2b..918eb5a 100644
--- a/templates/amdgpu-clock.sh.j2
+++ b/templates/amdgpu-clock.sh.j2
@@ -13,18 +13,23 @@
 # AMDGPU driver/sysfs references:
 # https://01.org/linuxgraphics/gfx-docs/drm/gpu/amdgpu.html
 # https://docs.kernel.org/gpu/amdgpu/thermal.html
+
+{# done this way to avoid issues with the card number possibly shifting after playbook run #}
+# dynamically determine the connected GPU using the DRM subsystem
+CARD=$(/usr/bin/grep -ls '^connected' /sys/class/drm/*/status | /usr/bin/grep -oE 'card[0-9]+' | /usr/bin/sort -uV | /usr/bin/tail -1)
+
 {# begin the templated script for 'default' profiles to reset state #}
 {% if 'default' in item.0.key %}
 
 # set power state transition heuristics to default
-echo '{{ item.0.value.pwrmode }}' | tee /sys/class/drm/{{ card }}/device/pp_power_profile_mode
+echo '{{ item.0.value.pwrmode }}' | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode
 
 # set control mode back to auto
 # attempts to dynamically set optimal power profile for (load) conditions
-echo 'auto' | tee /sys/class/drm/{{ card }}/device/power_dpm_force_performance_level
+echo 'auto' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
 
 # reset any existing profile clock changes
-echo 'r' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage
+echo 'r' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
 
-# give default profile {{ power_max_default_multi * 100.0 |int }}% of the max power capability
+# give default profile {{ (power_max_default_multi * 100.0) | int }}% of the max power capability
 # {{ power_cap_default|int/1000000 }} Watts of {{ power_max|int/1000000 }} total
@@ -36,34 +41,34 @@ echo '{{ power_cap_default }}' | tee {{ powercap_set.files.0.path }}
 # allows control via 'pp_dpm_mclk', 'pp_dpm_sclk', 'pp_dpm_pcie', 'pp_dpm_fclk', and 'pp_power_profile_mode' files
 # only interested in 'pp_power_profile_mode' for power and 'pp_dpm_mclk' for memory clock (flickering).
 # GPU clocks are dynamic based on (load) condition
-echo 'manual' | tee /sys/class/drm/{{ card }}/device/power_dpm_force_performance_level
+echo 'manual' | tee /sys/class/drm/"${CARD}"/device/power_dpm_force_performance_level
 
 # set power state transition heuristics to '{{ item.0.key }}' profile
-echo '{{ item.0.value.pwrmode }}' | tee /sys/class/drm/{{ card }}/device/pp_power_profile_mode
+echo '{{ item.0.value.pwrmode }}' | tee /sys/class/drm/"${CARD}"/device/pp_power_profile_mode
 
-# give this profile {{ power_max_custom_multi * 100.0 |int }}% of the max power capability
+# give this profile {{ (power_max_custom_multi * 100.0) | int }}% of the max power capability
 # {{ power_cap_custom|int/1000000 }} Watts of {{ power_max|int/1000000 }} total
 echo '{{ power_cap_custom }}' | tee {{ powercap_set.files.0.path }}
 
 # set the minimum GPU clock
-echo 's 0 {{ gpu_clock_min }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage
+echo 's 0 {{ gpu_clock_min }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
 
 # set the maximum GPU clock
-echo 's 1 {{ gpu_clock_max }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage
+echo 's 1 {{ gpu_clock_max }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
 
 # set the maximum GPU *memory* clock
-echo 'm 1 {{ gpumem_clock_max }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage
+echo 'm 1 {{ gpumem_clock_max }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
 
 {% if gpu_mv_offset is defined %}
 # offset GPU voltage {{ gpu_mv_offset }}mV
-echo 'vo {{ gpu_mv_offset }}' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage
+echo 'vo {{ gpu_mv_offset }}' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
 {% endif %}
 
 # commit the changes
-echo 'c' | tee /sys/class/drm/{{ card }}/device/pp_od_clk_voltage
+echo 'c' | tee /sys/class/drm/"${CARD}"/device/pp_od_clk_voltage
 
 # force GPU memory into highest clock (fix flickering)
 # pp_dpm_*clk settings are unintuitive, giving profiles that may be used
 # opt not to set the others (eg: sclk/fclk) - those should remain for benefits from the curve
-echo '3' | tee /sys/class/drm/{{ card }}/device/pp_dpm_mclk
+echo '3' | tee /sys/class/drm/"${CARD}"/device/pp_dpm_mclk
 {% endif %}