2929from tarfile import TarFile
3030from typing import Any , Dict , List , Optional , Union
3131
32- from pydantic import ValidationError
32+ from pydantic import BaseModel , ValidationError
3333
3434from nodescraper .base .inbandcollectortask import InBandDataCollector
3535from nodescraper .enums import EventCategory , EventPriority , ExecutionStatus , OSFamily
3838from nodescraper .plugins .inband .amdsmi .amdsmidata import (
3939 AmdSmiDataModel ,
4040 AmdSmiListItem ,
41+ AmdSmiMetric ,
4142 AmdSmiStatic ,
4243 AmdSmiVersion ,
44+ BadPages ,
4345 EccState ,
4446 Fw ,
4547 FwListItem ,
6567 StaticVbios ,
6668 StaticVram ,
6769 StaticXgmiPlpd ,
70+ Topo ,
6871 ValueUnit ,
72+ XgmiLinks ,
73+ XgmiMetrics ,
6974)
7075from nodescraper .plugins .inband .amdsmi .collector_args import AmdSmiCollectorArgs
7176from nodescraper .utils import get_exception_traceback
@@ -87,6 +92,11 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, AmdSmiCollectorArgs])
8792 CMD_FIRMWARE = "firmware --json"
8893 CMD_STATIC = "static -g all --json"
8994 CMD_STATIC_GPU = "static -g {gpu_id} --json"
95+ CMD_TOPOLOGY = "topology"
96+ CMD_METRIC = "metric -g all"
97+ CMD_BAD_PAGES = "bad-pages"
98+ CMD_XGMI_METRIC = "xgmi -m"
99+ CMD_XGMI_LINK = "xgmi -l"
90100 CMD_RAS = "ras --cper --folder={folder}"
91101 CMD_RAS_AFID = "ras --afid --cper-file {cper_file}"
92102
@@ -317,10 +327,125 @@ def _normalize(self, val: object, default: str = "unknown", slot_type: bool = Fa
317327 if u == "CEM" :
318328 return "CEM"
319329 return "Unknown"
320-
321330 return s
322331
323- def _get_amdsmi_data (self ) -> Optional [AmdSmiDataModel ]:
def _build_amdsmi_sub_data(
    self,
    model_class: type[BaseModel],
    json_data: Optional[Union[dict, list]],
    *,
    model_name: Optional[str] = None,
) -> Optional[Union[list, Any]]:
    """Validate amd-smi JSON output against a Pydantic model.

    A list input is validated entry by entry: entries that are not dicts
    are skipped without logging, and entries that fail validation are
    logged as a warning event and skipped. A dict input is validated as a
    single instance.

    Args:
        model_class: Pydantic model class used to validate the data.
        json_data: Parsed JSON payload (dict or list), or None.
        model_name: Label used in logged events; falls back to
            ``model_class.__name__`` when not provided.

    Returns:
        A list of model instances for list input, a single model instance
        for dict input, or None when the input is None, is neither a dict
        nor a list, or fails validation as a whole.
    """
    name = model_name or model_class.__name__
    if json_data is None:
        return None
    try:
        if isinstance(json_data, dict):
            return model_class.model_validate(json_data)
        if not isinstance(json_data, list):
            return None

        instances: List[Any] = []
        for entry in json_data:
            if not isinstance(entry, dict):
                continue
            try:
                instance = model_class.model_validate(entry)
            except ValidationError as entry_err:
                # One bad entry must not discard the rest of the list.
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description=f"Failed to build {name} entry; skipping",
                    data={
                        "errors": entry_err.errors(include_url=False),
                        "item_keys": list(entry.keys()),
                    },
                    priority=EventPriority.WARNING,
                )
            else:
                instances.append(instance)
        return instances
    except ValidationError as err:
        self._log_event(
            category=EventCategory.APPLICATION,
            description=f"Failed to build {name} ",
            data={"errors": err.errors(include_url=False)},
            priority=EventPriority.WARNING,
        )
        return None
382+
def get_topology(self) -> Optional[List[Topo]]:
    """Collect topology data and convert it to Topo models.

    Runs ``CMD_TOPOLOGY`` through ``_run_amd_smi_dict``. When the result is
    a dict carrying a "gpu_data" key, that value is used as the payload.

    Returns:
        List of Topo instances; an empty list when the command returned
        nothing or no entry could be built.
    """
    raw = self._run_amd_smi_dict(self.CMD_TOPOLOGY)
    if raw is None:
        return []
    if isinstance(raw, dict) and "gpu_data" in raw:
        raw = raw["gpu_data"]
    # The builder is always handed a list so it takes the per-entry path.
    if not isinstance(raw, list):
        raw = [raw]
    models = self._build_amdsmi_sub_data(Topo, raw)
    if isinstance(models, list):
        return models
    if models:
        return [models]
    return []
393+
def get_bad_pages(self) -> Optional[List[BadPages]]:
    """Collect bad-page data and convert it to BadPages models.

    Runs ``CMD_BAD_PAGES`` through ``_run_amd_smi_dict`` and validates the
    result with ``_build_amdsmi_sub_data``.

    Returns:
        List of BadPages instances; an empty list when the command
        returned nothing or no entry could be built.
    """
    raw = self._run_amd_smi_dict(self.CMD_BAD_PAGES)
    if raw is None:
        return []
    # The builder is always handed a list so it takes the per-entry path.
    if not isinstance(raw, list):
        raw = [raw]
    models = self._build_amdsmi_sub_data(BadPages, raw)
    if isinstance(models, list):
        return models
    if models:
        return [models]
    return []
402+
def get_metric(self) -> Optional[List[AmdSmiMetric]]:
    """Collect metric data and convert it to AmdSmiMetric models.

    Runs ``CMD_METRIC`` through ``_run_amd_smi_dict``. When the result is a
    dict carrying a "gpu_data" key, that value is used as the payload.

    Returns:
        List of AmdSmiMetric instances; an empty list when the command
        returned nothing or no entry could be built.
    """
    raw = self._run_amd_smi_dict(self.CMD_METRIC)
    if raw is None:
        return []
    if isinstance(raw, dict) and "gpu_data" in raw:
        raw = raw["gpu_data"]
    # The builder is always handed a list so it takes the per-entry path.
    if not isinstance(raw, list):
        raw = [raw]
    models = self._build_amdsmi_sub_data(AmdSmiMetric, raw)
    if isinstance(models, list):
        return models
    if models:
        return [models]
    return []
413+
def get_xgmi_data(
    self,
) -> tuple[List[XgmiMetrics], List[XgmiLinks]]:
    """Collect XGMI metric and link data.

    Runs ``CMD_XGMI_METRIC`` first and ``CMD_XGMI_LINK`` second, each
    through ``_run_amd_smi_dict``, and validates the results with
    ``_build_amdsmi_sub_data``.

    Returns:
        A ``(metrics, links)`` pair of lists; either list is empty when
        its command produced no usable data.
    """
    # --- XGMI metrics ---
    metric_raw = self._run_amd_smi_dict(self.CMD_XGMI_METRIC)
    metrics: Optional[List[XgmiMetrics]] = []
    if metric_raw is not None:
        if isinstance(metric_raw, dict) and "xgmi_metric" in metric_raw:
            metric_raw = metric_raw["xgmi_metric"]
        # A one-element list is unwrapped to its sole element before the
        # payload shape is decided.
        if isinstance(metric_raw, list) and len(metric_raw) == 1:
            metric_raw = metric_raw[0]
        if isinstance(metric_raw, list):
            metric_entries = metric_raw
        elif isinstance(metric_raw, dict):
            metric_entries = [metric_raw]
        else:
            metric_entries = []
        built_metrics = self._build_amdsmi_sub_data(XgmiMetrics, metric_entries)
        if isinstance(built_metrics, list):
            metrics = built_metrics
        elif built_metrics:
            metrics = [built_metrics]

    # --- XGMI links ---
    link_raw = self._run_amd_smi_dict(self.CMD_XGMI_LINK)
    if isinstance(link_raw, dict) and "link_status" in link_raw:
        link_entries = link_raw.get("link_status")
    else:
        link_entries = link_raw
    links: Optional[List[XgmiLinks]] = []
    # Only list payloads are built; any other shape yields no links.
    if isinstance(link_entries, list):
        built_links = self._build_amdsmi_sub_data(XgmiLinks, link_entries)
        if isinstance(built_links, list):
            links = built_links

    return metrics or [], links or []
445+
446+ def _get_amdsmi_data (
447+ self , args : Optional [AmdSmiCollectorArgs ] = None
448+ ) -> Optional [AmdSmiDataModel ]:
324449 """Fill in information for AmdSmi data model
325450
326451 Returns:
@@ -333,6 +458,10 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
333458 firmware = self .get_firmware ()
334459 gpu_list = self .get_gpu_list ()
335460 statics = self .get_static ()
461+ topology = self .get_topology ()
462+ metric = self .get_metric ()
463+ bad_pages = self .get_bad_pages ()
464+ xgmi_metric , xgmi_link = self .get_xgmi_data ()
336465 cper_data , cper_afids = self .get_cper_data ()
337466 except Exception as e :
338467 self ._log_event (
@@ -353,6 +482,11 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
353482 partition = partition ,
354483 firmware = firmware ,
355484 static = statics ,
485+ topology = topology or [],
486+ metric = metric or [],
487+ bad_pages = bad_pages or [],
488+ xgmi_metric = xgmi_metric or [],
489+ xgmi_link = xgmi_link or [],
356490 cper_data = cper_data ,
357491 cper_afids = cper_afids ,
358492 )
@@ -1348,7 +1482,7 @@ def collect_data(
13481482 self .logger .info ("amd-smi version: %s" , version .version )
13491483 self .logger .info ("ROCm version: %s" , version .rocm_version )
13501484
1351- amd_smi_data = self ._get_amdsmi_data ()
1485+ amd_smi_data = self ._get_amdsmi_data (args )
13521486
13531487 if amd_smi_data is None :
13541488 return self .result , None
0 commit comments