From 8d633cf391671ea285d4978e88e213bc79eab804 Mon Sep 17 00:00:00 2001 From: Josh Lay Date: Wed, 26 Apr 2023 22:58:39 -0500 Subject: [PATCH] utils: make funcs require card, offer dict of discovery --- src/amdgpu_stats/utils.py | 259 +++++++++++++++++++++----------------- 1 file changed, 145 insertions(+), 114 deletions(-) diff --git a/src/amdgpu_stats/utils.py b/src/amdgpu_stats/utils.py index eff62be..a9022fe 100644 --- a/src/amdgpu_stats/utils.py +++ b/src/amdgpu_stats/utils.py @@ -8,85 +8,64 @@ Variables: - hwmon_dir: the `hwmon` interface (dir) that provides stats for this card - SRC_FILES: dictionary of the known stats from the items in `hwmon_dir` - TEMP_FILES: dictionary of the *discovered* temperature nodes / stat files - - POWER_DOMAINS: tuple of supported power domains: `average`, `limit`, `cap`, and `default` - CLOCK_DOMAINS: tuple of supported clock domains: `core`, `memory` """ # disable superfluous linting # pylint: disable=line-too-long from os import path import glob -from typing import Tuple, Optional, Union +from typing import Optional, Union from humanfriendly import format_size -def find_card() -> Optional[Tuple[Optional[str], Optional[str]]]: - """Searches contents of /sys/class/drm/card*/device/hwmon/hwmon*/name +def find_cards() -> dict: + """Searches contents of `/sys/class/drm/card*/device/hwmon/hwmon*/name` - ... looking for 'amdgpu' to find a card to monitor + Reads 'hwmon' names looking for 'amdgpu' to find cards to monitor. - If no AMD GPU found, this will be: (None, None) + If device(s) found, returns a dictionary of cards with their hwmon directories. + + If *none* found, this will be an empty dict. 
Returns: - tuple: ('cardN', '/hwmon/directory/with/stat/files') + dict: `{'cardN': '/hwmon/directory/with/stat/files', 'cardY': '/other/hwmon/directory/for/cardY'}` """ - _card = None - _hwmon_dir = None - hwmon_names_glob = '/sys/class/drm/card*/device/hwmon/hwmon*/name' - hwmon_names = glob.glob(hwmon_names_glob) + cards = {} + card_glob_pattern = '/sys/class/drm/card*/device/hwmon/hwmon*/name' + hwmon_names = glob.glob(card_glob_pattern) for hwmon_name_file in hwmon_names: with open(hwmon_name_file, "r", encoding="utf-8") as _f: if _f.read().strip() == 'amdgpu': # found an amdgpu - # note: if multiple are found, last will be used/watched - # will be configurable in the future, may prompt _card = hwmon_name_file.split('/')[4] _hwmon_dir = path.dirname(hwmon_name_file) - return _card, _hwmon_dir + cards[_card] = _hwmon_dir + return cards -# base vars: card identifier, hwmon directory for stats, then the stat dicts -CARD, hwmon_dir = find_card() -if CARD is not None: - card_dir = path.join("/sys/class/drm/", CARD) # eg: /sys/class/drm/card0/ - - # dictionary of known source files - # ref: https://docs.kernel.org/gpu/amdgpu/thermal.html - SRC_FILES = {'pwr_limit': path.join(hwmon_dir, "power1_cap"), - 'pwr_average': path.join(hwmon_dir, "power1_average"), - 'pwr_cap': path.join(hwmon_dir, "power1_cap_max"), - 'pwr_default': path.join(hwmon_dir, "power1_cap_default"), - 'core_clock': path.join(hwmon_dir, "freq1_input"), - 'core_voltage': path.join(hwmon_dir, "in0_input"), - 'memory_clock': path.join(hwmon_dir, "freq2_input"), - 'busy_pct': path.join(card_dir, "device/gpu_busy_percent"), - 'temp_c': path.join(hwmon_dir, "temp1_input"), - 'fan_rpm': path.join(hwmon_dir, "fan1_input"), - 'fan_rpm_target': path.join(hwmon_dir, "fan1_target"), - } - - # determine temperature nodes, construct a dict to store them - # interface will iterate over these, creating labels as needed - TEMP_FILES = {} - temp_node_labels = glob.glob(path.join(hwmon_dir, "temp*_label")) - for 
temp_node_label_file in temp_node_labels: - # determine the base node id, eg: temp1 - # construct the path to the file that will label it. ie: edge/junction - temp_node_id = path.basename(temp_node_label_file).split('_')[0] - temp_node_value_file = path.join(hwmon_dir, f"{temp_node_id}_input") - with open(temp_node_label_file, 'r', encoding='utf-8') as _node: - temp_node_name = _node.read().strip() - # add the node name/type and the corresponding temp file to the dict - TEMP_FILES[temp_node_name] = temp_node_value_file +# discover all available AMD GPUs +AMDGPU_CARDS = find_cards() +# supported clock domains by 'get_clock' func +# is concatenated with 'clock_' to index SRC_FILES for the relevant data file +CLOCK_DOMAINS = ('core', 'memory') +# defined outside/globally for efficiency -- it's called a lot in the TUI -def read_stat(file: str) -> str: +def read_stat(file: str, stat_type: Optional[str] = None) -> str: """Read statistic `file`, return the stripped contents + Args: + file (str): The statistic file to read/return + + stat_type (str): Optional type, if specified - can convert data. + Returns: - str: Statistics from `file`""" + str: Statistics from `file`. If `stat_type='power'`, will convert microwatts to Watts""" with open(file, "r", encoding="utf-8") as _fh: - data = _fh.read() - return data.strip() + data = _fh.read().strip() + if stat_type == 'power': + data = int(int(data) / 1000000) + return data def format_frequency(frequency_hz: int) -> str: @@ -103,47 +82,35 @@ def format_frequency(frequency_hz: int) -> str: ) -def get_power_stats() -> dict: +def get_power_stats(card: str) -> dict: """ + Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` + Returns: dict: A dictionary of current GPU *power* related statistics.
Example: `{'limit': int, 'average': int, 'capability': int, 'default': int}` """ - return {"limit": get_gpu_power('limit'), - "average": get_gpu_power('average'), - "capability": get_gpu_power('cap'), - "default": get_gpu_power('default')} + if card in AMDGPU_CARDS: + hwmon_dir = AMDGPU_CARDS[card] + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") + + return {"limit": read_stat(path.join(hwmon_dir, "power1_cap"), stat_type='power'), + "average": read_stat(path.join(hwmon_dir, "power1_average"), stat_type='power'), + "capability": read_stat(path.join(hwmon_dir, "power1_cap_max"), stat_type='power'), + "default": read_stat(path.join(hwmon_dir, "power1_cap_default"), stat_type='power')} -# constant; supported power domains by 'get_gpu_power' func -# is concatenated with 'pwr_' to index SRC_FILES for the relevant data file -POWER_DOMAINS = ('limit', 'average', 'cap', 'default') -# defined outside/globally for efficiency -- it's called a lot in the TUI - - -def get_gpu_power(domain: str) -> int: +def get_core_stats(card: str) -> dict: """ Args: - domain (str): The GPU domain of interest regarding power + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` - Must be one of POWER_DOMAINS: - - limit: the effective limit placed on the card - - default: the default limit - - average: the average consumption - - cap: the board capability - - Returns: - int: The requested GPU power statistic by domain, in Watts - """ - if domain not in POWER_DOMAINS: - raise ValueError(f"Invalid power domain: '{domain}'. Must be one of: {POWER_DOMAINS}") - return int(int(read_stat(SRC_FILES['pwr_' + domain])) / 1000000) - - -def get_core_stats() -> dict: - """ Returns: dict: A dictionary of current GPU *core/memory* related statistics. 
@@ -152,21 +119,22 @@ def get_core_stats() -> dict: Example: `{'sclk': int, 'mclk': int, 'voltage': float, 'util_pct': int}` """ - return {"sclk": get_clock('core'), - "mclk": get_clock('memory'), - "voltage": get_voltage(), - "util_pct": get_gpu_usage()} + # verify card -- is it AMD, do we know the hwmon directory? + if card in AMDGPU_CARDS: + return {"sclk": get_clock(card, 'core'), + "mclk": get_clock(card, 'memory'), + "voltage": get_voltage(card), + "util_pct": get_gpu_usage(card)} + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") -# constant; supported clock domains by 'get_clock' func -# is concatenated with 'clock_' to index SRC_FILES for the relevant data file -CLOCK_DOMAINS = ('core', 'memory') -# defined outside/globally for efficiency -- it's called a lot in the TUI - - -def get_clock(domain: str, format_freq: bool = False) -> Union[int, str]: +def get_clock(card: str, domain: str, format_freq: bool = False) -> Union[int, str]: """ Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` + domain (str): The GPU domain of interest regarding clock speed. Must be one of CLOCK_DOMAINS @@ -178,59 +146,102 @@ def get_clock(domain: str, format_freq: bool = False) -> Union[int, str]: If format_freq is True, a formatted string with Hz/MHz/GHz will be returned instead of an int """ + # verify card -- is it AMD, do we know the hwmon directory? + if card in AMDGPU_CARDS: + hwmon_dir = AMDGPU_CARDS[card] + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") if domain not in CLOCK_DOMAINS: raise ValueError(f"Invalid clock domain: '{domain}'. 
Must be one of: {CLOCK_DOMAINS}") + # set the clock file based on requested domain + if domain == 'core': + clock_file = path.join(hwmon_dir, "freq1_input") + elif domain == 'memory': + clock_file = path.join(hwmon_dir, "freq2_input") + # handle output processing if format_freq: - return format_frequency(read_stat(SRC_FILES[domain + '_clock'])) - return int(read_stat(SRC_FILES[domain + '_clock'])) + return format_frequency(int(read_stat(clock_file))) + return int(read_stat(clock_file)) -def get_voltage() -> float: +def get_voltage(card: str) -> float: """ + Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` + Returns: float: The current GPU core voltage """ - return round(int(read_stat(SRC_FILES['core_voltage'])) / 1000.0, 2) + # verify card -- is it AMD, do we know the hwmon directory? + if card in AMDGPU_CARDS: + hwmon_dir = AMDGPU_CARDS[card] + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") + return round(int(read_stat(path.join(hwmon_dir, "in0_input"))) / 1000.0, 2) -def get_fan_stats() -> dict: +def get_fan_rpm(card: str) -> int: """ - Returns: - dict: A dictionary of current GPU *fan* related statistics. + Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` - Example: - `{'fan_rpm': int, 'fan_rpm_target': int}` - """ - return {"fan_rpm": get_fan_rpm(), - "fan_rpm_target": get_fan_target()} - - -def get_fan_rpm() -> int: - """ Returns: int: The current fan RPM """ - return int(read_stat(SRC_FILES['fan_rpm'])) + # verify card -- is it AMD, do we know the hwmon directory? + if card in AMDGPU_CARDS: + hwmon_dir = AMDGPU_CARDS[card] + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. 
Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") + return int(read_stat(path.join(hwmon_dir, "fan1_input"))) -def get_fan_target() -> int: +def get_fan_target(card: str) -> int: """ + Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` + Returns: int: The current fan RPM """ - return int(read_stat(SRC_FILES['fan_rpm_target'])) + # verify card -- is it AMD, do we know the hwmon directory? + if card in AMDGPU_CARDS: + hwmon_dir = AMDGPU_CARDS[card] + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") + return int(read_stat(path.join(hwmon_dir, "fan1_target"))) -def get_gpu_usage() -> int: +def get_gpu_usage(card: str) -> int: """ + Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` + Returns: int: The current GPU usage/utilization as a percentage """ - return int(read_stat(SRC_FILES['busy_pct'])) + if card in AMDGPU_CARDS: + stat_file = path.join("/sys/class/drm/", card, "device/gpu_busy_percent") + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") + return int(read_stat(stat_file)) -def get_temp_stats() -> dict: +def get_temp_stats(card: str) -> dict: """ + Args: + card (str): Card identifier from `/dev/dri/`, ie: `card0`. See `AMDGPU_CARDS` or `find_cards()` + Returns: dict: A dictionary of current GPU *temperature* related statistics. 
@@ -243,8 +254,28 @@ def get_temp_stats() -> dict: Returned values are converted to C, as integers for simple comparison """ + if card in AMDGPU_CARDS: + hwmon_dir = AMDGPU_CARDS[card] + else: + if len(AMDGPU_CARDS) > 0: + raise ValueError(f"Invalid card: '{card}'. Must be one of: {list(AMDGPU_CARDS.keys())}") + raise ValueError(f"Invalid card: '{card}', no AMD GPUs or hwmon directories found") + # determine temperature nodes, construct a dict to store them + # interface will iterate over these, creating labels as needed + temp_files = {} + temp_node_labels = glob.glob(path.join(hwmon_dir, "temp*_label")) + for temp_node_label_file in temp_node_labels: + # determine the base node id, eg: temp1 + # construct the path to the file that will label it. ie: edge/junction + temp_node_id = path.basename(temp_node_label_file).split('_')[0] + temp_node_value_file = path.join(hwmon_dir, f"{temp_node_id}_input") + with open(temp_node_label_file, 'r', encoding='utf-8') as _node: + temp_node_name = _node.read().strip() + # add the node name/type and the corresponding temp file to the dict + temp_files[temp_node_name] = temp_node_value_file + temp_update = {} - for temp_node, temp_file in TEMP_FILES.items(): + for temp_node, temp_file in temp_files.items(): # iterate through the discovered temperature nodes # ... updating the dictionary with new stats _temperature = int(int(read_stat(temp_file)) // 1000)