Archived
1
1
Fork 0

utils: make funcs require card, offer dict of discovery

This commit is contained in:
Josh Lay 2023-04-26 22:58:39 -05:00
parent 016cb6844e
commit 8d633cf391
Signed by: jlay
GPG key ID: B265E45CACAD108A

View file

@ -8,85 +8,64 @@ Variables:
- hwmon_dir: the `hwmon` interface (dir) that provides stats for this card - hwmon_dir: the `hwmon` interface (dir) that provides stats for this card
- SRC_FILES: dictionary of the known stats from the items in `hwmon_dir` - SRC_FILES: dictionary of the known stats from the items in `hwmon_dir`
- TEMP_FILES: dictionary of the *discovered* temperature nodes / stat files - TEMP_FILES: dictionary of the *discovered* temperature nodes / stat files
- POWER_DOMAINS: tuple of supported power domains: `average`, `limit`, `cap`, and `default`
- CLOCK_DOMAINS: tuple of supported clock domains: `core`, `memory` - CLOCK_DOMAINS: tuple of supported clock domains: `core`, `memory`
""" """
# disable superfluous linting # disable superfluous linting
# pylint: disable=line-too-long # pylint: disable=line-too-long
from os import path from os import path
import glob import glob
from typing import Tuple, Optional, Union from typing import Optional, Union
from humanfriendly import format_size from humanfriendly import format_size
def find_card() -> Optional[Tuple[Optional[str], Optional[str]]]: def find_cards() -> dict:
"""Searches contents of /sys/class/drm/card*/device/hwmon/hwmon*/name """Searches contents of `/sys/class/drm/card*/device/hwmon/hwmon*/name`
... looking for 'amdgpu' to find a card to monitor Reads 'hwmon' names looking for 'amdgpu' to find cards to monitor.
If no AMD GPU found, this will be: (None, None) If device(s) found, returns a dictionary of cards with their hwmon directories.
If *none* found, this will be an empty dict.
Returns: Returns:
tuple: ('cardN', '/hwmon/directory/with/stat/files') dict: `{'cardN': '/hwmon/directory/with/stat/files', 'cardY': '/other/hwmon/directory/for/cardY'}`
""" """
_card = None cards = {}
_hwmon_dir = None card_glob_pattern = '/sys/class/drm/card*/device/hwmon/hwmon*/name'
hwmon_names_glob = '/sys/class/drm/card*/device/hwmon/hwmon*/name' hwmon_names = glob.glob(card_glob_pattern)
hwmon_names = glob.glob(hwmon_names_glob)
for hwmon_name_file in hwmon_names: for hwmon_name_file in hwmon_names:
with open(hwmon_name_file, "r", encoding="utf-8") as _f: with open(hwmon_name_file, "r", encoding="utf-8") as _f:
if _f.read().strip() == 'amdgpu': if _f.read().strip() == 'amdgpu':
# found an amdgpu # found an amdgpu
# note: if multiple are found, last will be used/watched
# will be configurable in the future, may prompt
_card = hwmon_name_file.split('/')[4] _card = hwmon_name_file.split('/')[4]
_hwmon_dir = path.dirname(hwmon_name_file) _hwmon_dir = path.dirname(hwmon_name_file)
return _card, _hwmon_dir cards[_card] = _hwmon_dir
return cards
# base vars: card identifier, hwmon directory for stats, then the stat dicts # discover all available AMD GPUs
CARD, hwmon_dir = find_card() AMDGPU_CARDS = find_cards()
if CARD is not None: # supported clock domains by 'get_clock' func
card_dir = path.join("/sys/class/drm/", CARD) # eg: /sys/class/drm/card0/ # each domain selects a hwmon frequency file in 'get_clock': 'core' -> freq1_input, 'memory' -> freq2_input
CLOCK_DOMAINS = ('core', 'memory')
# dictionary of known source files # defined outside/globally for efficiency -- it's called a lot in the TUI
# ref: https://docs.kernel.org/gpu/amdgpu/thermal.html
SRC_FILES = {'pwr_limit': path.join(hwmon_dir, "power1_cap"),
'pwr_average': path.join(hwmon_dir, "power1_average"),
'pwr_cap': path.join(hwmon_dir, "power1_cap_max"),
'pwr_default': path.join(hwmon_dir, "power1_cap_default"),
'core_clock': path.join(hwmon_dir, "freq1_input"),
'core_voltage': path.join(hwmon_dir, "in0_input"),
'memory_clock': path.join(hwmon_dir, "freq2_input"),
'busy_pct': path.join(card_dir, "device/gpu_busy_percent"),
'temp_c': path.join(hwmon_dir, "temp1_input"),
'fan_rpm': path.join(hwmon_dir, "fan1_input"),
'fan_rpm_target': path.join(hwmon_dir, "fan1_target"),
}
# determine temperature nodes, construct a dict to store them
# interface will iterate over these, creating labels as needed
TEMP_FILES = {}
temp_node_labels = glob.glob(path.join(hwmon_dir, "temp*_label"))
for temp_node_label_file in temp_node_labels:
# determine the base node id, eg: temp1
# construct the path to the file that will label it. ie: edge/junction
temp_node_id = path.basename(temp_node_label_file).split('_')[0]
temp_node_value_file = path.join(hwmon_dir, f"{temp_node_id}_input")
with open(temp_node_label_file, 'r', encoding='utf-8') as _node:
temp_node_name = _node.read().strip()
# add the node name/type and the corresponding temp file to the dict
TEMP_FILES[temp_node_name] = temp_node_value_file
def read_stat(file: str) -> str: def read_stat(file: str, stat_type: Optional[str] = None) -> str:
"""Read statistic `file`, return the stripped contents """Read statistic `file`, return the stripped contents
Args:
file (str): The statistic file to read/return
stat_type (str): Optional type, if specified - can convert data.
Returns: Returns:
str: Statistics from `file`""" str: Statistics from `file`. If `stat_type='power'`, the raw value is divided by 1,000,000 (microwatts to Watts) and returned as an int"""
with open(file, "r", encoding="utf-8") as _fh: with open(file, "r", encoding="utf-8") as _fh:
data = _fh.read() data = _fh.read().strip()
return data.strip() if stat_type == 'power':
data = int(int(data) / 1000000)
return data
def format_frequency(frequency_hz: int) -> str: def format_frequency(frequency_hz: int) -> str:
@ -103,47 +82,35 @@ def format_frequency(frequency_hz: int) -> str:
) )
def get_power_stats() -> dict: def get_power_stats(card: str) -> dict:
""" """
Args:
card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Returns: Returns:
dict: A dictionary of current GPU *power* related statistics. dict: A dictionary of current GPU *power* related statistics.
Example: Example:
`{'limit': int, 'average': int, 'capability': int, 'default': int}` `{'limit': int, 'average': int, 'capability': int, 'default': int}`
""" """
return {"limit": get_gpu_power('limit'), if card in AMDGPU_CARDS:
"average": get_gpu_power('average'), hwmon_dir = AMDGPU_CARDS[card]
"capability": get_gpu_power('cap'), else:
"default": get_gpu_power('default')} if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
return {"limit": read_stat(path.join(hwmon_dir, "power1_cap"), stat_type='power'),
"average": read_stat(path.join(hwmon_dir, "power1_average"), stat_type='power'),
"capability": read_stat(path.join(hwmon_dir, "power1_cap_max"), stat_type='power'),
"default": read_stat(path.join(hwmon_dir, "power1_cap_default"), stat_type='power')}
# constant; supported power domains by 'get_gpu_power' func def get_core_stats(card: str) -> dict:
# is concatenated with 'pwr_' to index SRC_FILES for the relevant data file
POWER_DOMAINS = ('limit', 'average', 'cap', 'default')
# defined outside/globally for efficiency -- it's called a lot in the TUI
def get_gpu_power(domain: str) -> int:
""" """
Args: Args:
domain (str): The GPU domain of interest regarding power card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Must be one of POWER_DOMAINS:
- limit: the effective limit placed on the card
- default: the default limit
- average: the average consumption
- cap: the board capability
Returns:
int: The requested GPU power statistic by domain, in Watts
"""
if domain not in POWER_DOMAINS:
raise ValueError(f"Invalid power domain: '{domain}'. Must be one of: {POWER_DOMAINS}")
return int(int(read_stat(SRC_FILES['pwr_' + domain])) / 1000000)
def get_core_stats() -> dict:
"""
Returns: Returns:
dict: A dictionary of current GPU *core/memory* related statistics. dict: A dictionary of current GPU *core/memory* related statistics.
@ -152,21 +119,22 @@ def get_core_stats() -> dict:
Example: Example:
`{'sclk': int, 'mclk': int, 'voltage': float, 'util_pct': int}` `{'sclk': int, 'mclk': int, 'voltage': float, 'util_pct': int}`
""" """
return {"sclk": get_clock('core'), # verify card -- is it AMD, do we know the hwmon directory?
"mclk": get_clock('memory'), if card in AMDGPU_CARDS:
"voltage": get_voltage(), return {"sclk": get_clock(card, 'core'),
"util_pct": get_gpu_usage()} "mclk": get_clock(card, 'memory'),
"voltage": get_voltage(card),
"util_pct": get_gpu_usage(card)}
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
# constant; supported clock domains by 'get_clock' func def get_clock(card: str, domain: str, format_freq: bool = False) -> Union[int, str]:
# is concatenated with 'clock_' to index SRC_FILES for the relevant data file
CLOCK_DOMAINS = ('core', 'memory')
# defined outside/globally for efficiency -- it's called a lot in the TUI
def get_clock(domain: str, format_freq: bool = False) -> Union[int, str]:
""" """
Args: Args:
card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
domain (str): The GPU domain of interest regarding clock speed. domain (str): The GPU domain of interest regarding clock speed.
Must be one of CLOCK_DOMAINS Must be one of CLOCK_DOMAINS
@ -178,59 +146,102 @@ def get_clock(domain: str, format_freq: bool = False) -> Union[int, str]:
If format_freq is True, a formatted string with Hz/MHz/GHz If format_freq is True, a formatted string with Hz/MHz/GHz
will be returned instead of an int will be returned instead of an int
""" """
# verify card -- is it AMD, do we know the hwmon directory?
if card in AMDGPU_CARDS:
hwmon_dir = AMDGPU_CARDS[card]
else:
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
if domain not in CLOCK_DOMAINS: if domain not in CLOCK_DOMAINS:
raise ValueError(f"Invalid clock domain: '{domain}'. Must be one of: {CLOCK_DOMAINS}") raise ValueError(f"Invalid clock domain: '{domain}'. Must be one of: {CLOCK_DOMAINS}")
# set the clock file based on requested domain
if domain == 'core':
clock_file = path.join(hwmon_dir, "freq1_input")
elif domain == 'memory':
clock_file = path.join(hwmon_dir, "freq2_input")
# handle output processing
if format_freq: if format_freq:
return format_frequency(read_stat(SRC_FILES[domain + '_clock'])) return format_frequency(int(read_stat(clock_file)))
return int(read_stat(SRC_FILES[domain + '_clock'])) return int(read_stat(clock_file))
def get_voltage() -> float: def get_voltage(card: str) -> float:
""" """
Args:
card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Returns: Returns:
float: The current GPU core voltage float: The current GPU core voltage
""" """
return round(int(read_stat(SRC_FILES['core_voltage'])) / 1000.0, 2) # verify card -- is it AMD, do we know the hwmon directory?
if card in AMDGPU_CARDS:
hwmon_dir = AMDGPU_CARDS[card]
else:
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
return round(int(read_stat(path.join(hwmon_dir, "in0_input"))) / 1000.0, 2)
def get_fan_stats() -> dict: def get_fan_rpm(card: str) -> int:
""" """
Returns: Args:
dict: A dictionary of current GPU *fan* related statistics. card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Example:
`{'fan_rpm': int, 'fan_rpm_target': int}`
"""
return {"fan_rpm": get_fan_rpm(),
"fan_rpm_target": get_fan_target()}
def get_fan_rpm() -> int:
"""
Returns: Returns:
int: The current fan RPM int: The current fan RPM
""" """
return int(read_stat(SRC_FILES['fan_rpm'])) # verify card -- is it AMD, do we know the hwmon directory?
if card in AMDGPU_CARDS:
hwmon_dir = AMDGPU_CARDS[card]
else:
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
return int(read_stat(path.join(hwmon_dir, "fan1_input")))
def get_fan_target() -> int: def get_fan_target(card: str) -> int:
""" """
Args:
card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Returns: Returns:
int: The target fan RPM int: The target fan RPM
""" """
return int(read_stat(SRC_FILES['fan_rpm_target'])) # verify card -- is it AMD, do we know the hwmon directory?
if card in AMDGPU_CARDS:
hwmon_dir = AMDGPU_CARDS[card]
else:
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
return int(read_stat(path.join(hwmon_dir, "fan1_target")))
def get_gpu_usage() -> int: def get_gpu_usage(card: str) -> int:
""" """
Args:
card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Returns: Returns:
int: The current GPU usage/utilization as a percentage int: The current GPU usage/utilization as a percentage
""" """
return int(read_stat(SRC_FILES['busy_pct'])) if card in AMDGPU_CARDS:
stat_file = path.join("/sys/class/drm/", card, "device/gpu_busy_percent")
else:
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
return int(read_stat(stat_file))
def get_temp_stats() -> dict: def get_temp_stats(card: str) -> dict:
""" """
Args:
card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()`
Returns: Returns:
dict: A dictionary of current GPU *temperature* related statistics. dict: A dictionary of current GPU *temperature* related statistics.
@ -243,8 +254,28 @@ def get_temp_stats() -> dict:
Returned values are converted to C, as integers for simple comparison Returned values are converted to C, as integers for simple comparison
""" """
if card in AMDGPU_CARDS:
hwmon_dir = AMDGPU_CARDS[card]
else:
if len(AMDGPU_CARDS) > 0:
raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}")
raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found")
# determine temperature nodes, construct a dict to store them
# interface will iterate over these, creating labels as needed
temp_files = {}
temp_node_labels = glob.glob(path.join(hwmon_dir, "temp*_label"))
for temp_node_label_file in temp_node_labels:
# determine the base node id, eg: temp1
# construct the path to the file that will label it. ie: edge/junction
temp_node_id = path.basename(temp_node_label_file).split('_')[0]
temp_node_value_file = path.join(hwmon_dir, f"{temp_node_id}_input")
with open(temp_node_label_file, 'r', encoding='utf-8') as _node:
temp_node_name = _node.read().strip()
# add the node name/type and the corresponding temp file to the dict
temp_files[temp_node_name] = temp_node_value_file
temp_update = {} temp_update = {}
for temp_node, temp_file in TEMP_FILES.items(): for temp_node, temp_file in temp_files.items():
# iterate through the discovered temperature nodes # iterate through the discovered temperature nodes
# ... updating the dictionary with new stats # ... updating the dictionary with new stats
_temperature = int(int(read_stat(temp_file)) // 1000) _temperature = int(int(read_stat(temp_file)) // 1000)