Skip to content

Commit 54ebaec

Browse files
Merge pull request #142 from amd/alex_amdsmi
AmdSmiPlugin: enhancement
2 parents 3ee9c2d + 4d82868 commit 54ebaec

6 files changed

Lines changed: 499 additions & 87 deletions

File tree

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 18 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
AmdSmiDataModel,
3636
AmdSmiMetric,
3737
AmdSmiStatic,
38-
AmdSmiTstData,
3938
EccData,
4039
Fw,
4140
Partition,
@@ -47,7 +46,7 @@
4746

4847

4948
class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]):
50-
"""Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics"""
49+
"""Check AMD SMI Application data for PCIe, ECC errors, and CPER data."""
5150

5251
DATA_MODEL = AmdSmiDataModel
5352

@@ -667,6 +666,9 @@ def check_expected_xgmi_link_speed(
667666
)
668667
return
669668

669+
expected_str = ", ".join(str(s) for s in expected_xgmi_speed)
670+
mismatches: list[dict] = []
671+
670672
for xgmi_data in xgmi_metric:
671673
link_metric = xgmi_data.link_metrics
672674
try:
@@ -700,32 +702,26 @@ def check_expected_xgmi_link_speed(
700702
continue
701703

702704
if xgmi_float not in expected_xgmi_speed:
703-
self._log_event(
704-
category=EventCategory.IO,
705-
description="XGMI link speed is not as expected",
706-
priority=EventPriority.ERROR,
707-
data={
705+
mismatches.append(
706+
{
708707
"gpu": xgmi_data.gpu,
709-
"xgmi_bit_rate": xgmi_float,
710-
"expected_xgmi_speed": expected_xgmi_speed,
711-
},
712-
console_log=True,
708+
"actual_gt_s": xgmi_float,
709+
"expected_gt_s": expected_str,
710+
}
713711
)
714712

715-
def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData):
716-
"""Check AMD SMI test results
717-
718-
Args:
719-
amdsmitst_data (AmdSmiTstData): AMD SMI test data
720-
"""
721-
if amdsmitst_data.failed_test_count > 0:
713+
if mismatches:
714+
details = "; ".join(
715+
f"GPU {m['gpu']} {m['actual_gt_s']} GT/s (expected {m['expected_gt_s']})"
716+
for m in mismatches
717+
)
722718
self._log_event(
723-
category=EventCategory.APPLICATION,
724-
description=f"{amdsmitst_data.failed_test_count} failed tests running amdsmitst",
719+
category=EventCategory.IO,
720+
description=f"XGMI link speed is not as expected: {details}",
725721
priority=EventPriority.ERROR,
726722
data={
727-
"failed_test_count": amdsmitst_data.failed_test_count,
728-
"failed_tests": amdsmitst_data.failed_tests,
723+
"expected_gt_s": expected_str,
724+
"mismatches": mismatches,
729725
},
730726
console_log=True,
731727
)
@@ -815,7 +811,4 @@ def analyze_data(
815811
data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed
816812
)
817813

818-
if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0:
819-
self.check_amdsmitst(data.amdsmitst_data)
820-
821814
return self.result

nodescraper/plugins/inband/amdsmi/amdsmi_collector.py

Lines changed: 138 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from tarfile import TarFile
3030
from typing import Any, Dict, List, Optional, Union
3131

32-
from pydantic import ValidationError
32+
from pydantic import BaseModel, ValidationError
3333

3434
from nodescraper.base.inbandcollectortask import InBandDataCollector
3535
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
@@ -38,8 +38,10 @@
3838
from nodescraper.plugins.inband.amdsmi.amdsmidata import (
3939
AmdSmiDataModel,
4040
AmdSmiListItem,
41+
AmdSmiMetric,
4142
AmdSmiStatic,
4243
AmdSmiVersion,
44+
BadPages,
4345
EccState,
4446
Fw,
4547
FwListItem,
@@ -65,7 +67,10 @@
6567
StaticVbios,
6668
StaticVram,
6769
StaticXgmiPlpd,
70+
Topo,
6871
ValueUnit,
72+
XgmiLinks,
73+
XgmiMetrics,
6974
)
7075
from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs
7176
from nodescraper.utils import get_exception_traceback
@@ -87,6 +92,11 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, AmdSmiCollectorArgs])
8792
CMD_FIRMWARE = "firmware --json"
8893
CMD_STATIC = "static -g all --json"
8994
CMD_STATIC_GPU = "static -g {gpu_id} --json"
95+
CMD_TOPOLOGY = "topology"
96+
CMD_METRIC = "metric -g all"
97+
CMD_BAD_PAGES = "bad-pages"
98+
CMD_XGMI_METRIC = "xgmi -m"
99+
CMD_XGMI_LINK = "xgmi -l"
90100
CMD_RAS = "ras --cper --folder={folder}"
91101
CMD_RAS_AFID = "ras --afid --cper-file {cper_file}"
92102

@@ -317,10 +327,125 @@ def _normalize(self, val: object, default: str = "unknown", slot_type: bool = Fa
317327
if u == "CEM":
318328
return "CEM"
319329
return "Unknown"
320-
321330
return s
322331

323-
def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
332+
def _build_amdsmi_sub_data(
    self,
    model_class: type[BaseModel],
    json_data: Optional[Union[dict, list]],
    *,
    model_name: Optional[str] = None,
) -> Optional[Union[list, Any]]:
    """Validate raw amd-smi JSON against a Pydantic model.

    A dict payload is validated as one instance. A list payload is validated
    entry by entry: entries that are not dicts are ignored without a log
    event, and entries that fail validation are logged as warnings and left
    out, so one bad entry does not discard the rest.

    Args:
        model_class: Pydantic model class to validate with
            (e.g. Topo, BadPages, AmdSmiMetric).
        json_data: Parsed amd-smi output; a dict, a list, or None.
        model_name: Label used in log descriptions; falls back to
            ``model_class.__name__``.

    Returns:
        A list of model instances for list input, a single instance for dict
        input, or None when the input is None, is neither dict nor list, or
        fails validation as a whole.
    """
    label = model_name if model_name else model_class.__name__
    if json_data is None:
        return None

    try:
        if isinstance(json_data, dict):
            return model_class.model_validate(json_data)
        if not isinstance(json_data, list):
            return None

        validated: List[Any] = []
        for entry in json_data:
            if isinstance(entry, dict):
                try:
                    instance = model_class.model_validate(entry)
                except ValidationError as entry_err:
                    # Report the offending entry's keys, then keep going.
                    self._log_event(
                        category=EventCategory.APPLICATION,
                        description=f"Failed to build {label} entry; skipping",
                        data={
                            "errors": entry_err.errors(include_url=False),
                            "item_keys": list(entry.keys()),
                        },
                        priority=EventPriority.WARNING,
                    )
                else:
                    validated.append(instance)
        return validated
    except ValidationError as payload_err:
        self._log_event(
            category=EventCategory.APPLICATION,
            description=f"Failed to build {label}",
            data={"errors": payload_err.errors(include_url=False)},
            priority=EventPriority.WARNING,
        )
    return None
382+
383+
def get_topology(self) -> Optional[List[Topo]]:
    """Collect topology entries from the ``CMD_TOPOLOGY`` amd-smi command.

    A dict reply is unwrapped through its ``"gpu_data"`` key when present.

    Returns:
        Validated ``Topo`` entries. Given the visible code this is always a
        list (possibly empty), never None, despite the Optional annotation.
    """
    # NOTE(review): CMD_TOPOLOGY carries no "--json" flag; presumably
    # _run_amd_smi_dict adds it - confirm.
    return self._collect_validated_list(self.CMD_TOPOLOGY, Topo, unwrap_key="gpu_data")

def get_bad_pages(self) -> Optional[List[BadPages]]:
    """Collect retired-page entries from the ``CMD_BAD_PAGES`` amd-smi command.

    Returns:
        Validated ``BadPages`` entries. Given the visible code this is always
        a list (possibly empty), never None, despite the Optional annotation.
    """
    return self._collect_validated_list(self.CMD_BAD_PAGES, BadPages)

def get_metric(self) -> Optional[List[AmdSmiMetric]]:
    """Collect metric entries from the ``CMD_METRIC`` amd-smi command.

    A dict reply is unwrapped through its ``"gpu_data"`` key when present.

    Returns:
        Validated ``AmdSmiMetric`` entries. Given the visible code this is
        always a list (possibly empty), never None, despite the Optional
        annotation.
    """
    return self._collect_validated_list(self.CMD_METRIC, AmdSmiMetric, unwrap_key="gpu_data")

def _collect_validated_list(
    self,
    cmd: str,
    model_class: type[BaseModel],
    unwrap_key: Optional[str] = None,
) -> List[Any]:
    """Run one amd-smi sub-command and validate its reply into a list of models.

    Shared implementation for get_topology, get_bad_pages and get_metric,
    which differ only in command, model class and wrapper key.

    Args:
        cmd: Sub-command string handed to ``_run_amd_smi_dict``.
        model_class: Pydantic model each entry is validated against.
        unwrap_key: When given and the reply is a dict containing this key,
            the value under the key replaces the reply before validation.

    Returns:
        The validated instances; an empty list when the command returns None
        or nothing validates.
    """
    ret = self._run_amd_smi_dict(cmd)
    if ret is None:
        return []
    if unwrap_key is not None and isinstance(ret, dict) and unwrap_key in ret:
        ret = ret[unwrap_key]
    # A lone object is wrapped so validation always goes down the list path.
    data = ret if isinstance(ret, list) else [ret]
    built = self._build_amdsmi_sub_data(model_class, data)
    return built if isinstance(built, list) else ([built] if built else [])
413+
414+
def get_xgmi_data(
415+
self,
416+
) -> tuple[List[XgmiMetrics], List[XgmiLinks]]:
417+
"""Get XGMI metric and link data from amd-smi xgmi -m and xgmi -l."""
418+
xgmi_metric_raw = self._run_amd_smi_dict(self.CMD_XGMI_METRIC)
419+
xgmi_metrics: Optional[List[XgmiMetrics]] = []
420+
if xgmi_metric_raw is not None:
421+
if isinstance(xgmi_metric_raw, dict) and "xgmi_metric" in xgmi_metric_raw:
422+
xgmi_metric_raw = xgmi_metric_raw["xgmi_metric"]
423+
if isinstance(xgmi_metric_raw, list) and len(xgmi_metric_raw) == 1:
424+
xgmi_metric_raw = xgmi_metric_raw[0]
425+
data_m = (
426+
xgmi_metric_raw
427+
if isinstance(xgmi_metric_raw, list)
428+
else ([xgmi_metric_raw] if isinstance(xgmi_metric_raw, dict) else [])
429+
)
430+
built_m = self._build_amdsmi_sub_data(XgmiMetrics, data_m)
431+
xgmi_metrics = built_m if isinstance(built_m, list) else ([built_m] if built_m else [])
432+
433+
xgmi_link_raw = self._run_amd_smi_dict(self.CMD_XGMI_LINK)
434+
xgmi_links: Optional[List[XgmiLinks]] = []
435+
if isinstance(xgmi_link_raw, dict) and "link_status" in xgmi_link_raw:
436+
link_list = xgmi_link_raw.get("link_status")
437+
if isinstance(link_list, list):
438+
xgmi_links = self._build_amdsmi_sub_data(XgmiLinks, link_list)
439+
xgmi_links = xgmi_links if isinstance(xgmi_links, list) else []
440+
elif isinstance(xgmi_link_raw, list):
441+
xgmi_links = self._build_amdsmi_sub_data(XgmiLinks, xgmi_link_raw)
442+
xgmi_links = xgmi_links if isinstance(xgmi_links, list) else []
443+
444+
return xgmi_metrics or [], xgmi_links or []
445+
446+
def _get_amdsmi_data(
447+
self, args: Optional[AmdSmiCollectorArgs] = None
448+
) -> Optional[AmdSmiDataModel]:
324449
"""Fill in information for AmdSmi data model
325450
326451
Returns:
@@ -333,6 +458,10 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
333458
firmware = self.get_firmware()
334459
gpu_list = self.get_gpu_list()
335460
statics = self.get_static()
461+
topology = self.get_topology()
462+
metric = self.get_metric()
463+
bad_pages = self.get_bad_pages()
464+
xgmi_metric, xgmi_link = self.get_xgmi_data()
336465
cper_data, cper_afids = self.get_cper_data()
337466
except Exception as e:
338467
self._log_event(
@@ -353,6 +482,11 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
353482
partition=partition,
354483
firmware=firmware,
355484
static=statics,
485+
topology=topology or [],
486+
metric=metric or [],
487+
bad_pages=bad_pages or [],
488+
xgmi_metric=xgmi_metric or [],
489+
xgmi_link=xgmi_link or [],
356490
cper_data=cper_data,
357491
cper_afids=cper_afids,
358492
)
@@ -1348,7 +1482,7 @@ def collect_data(
13481482
self.logger.info("amd-smi version: %s", version.version)
13491483
self.logger.info("ROCm version: %s", version.rocm_version)
13501484

1351-
amd_smi_data = self._get_amdsmi_data()
1485+
amd_smi_data = self._get_amdsmi_data(args)
13521486

13531487
if amd_smi_data is None:
13541488
return self.result, None

nodescraper/plugins/inband/amdsmi/amdsmidata.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,19 @@ class PageData(BaseModel):
471471
value: Optional[int]
472472

473473

474+
def _bad_pages_retired_list(v: object) -> list[PageData]:
    """Map the 'No bad pages found.' placeholder to an empty list.

    Any other value is handed back untouched so that normal field
    validation can process (or reject) it.
    """
    return [] if v == "No bad pages found." else v  # type: ignore[return-value]
479+
480+
474481
class BadPages(BaseModel):
    # One amd-smi bad-pages entry; the collector validates each entry of the
    # bad-pages command output against this model.
    # presumably the GPU index this entry refers to - confirm against amd-smi output
    gpu: int
    # Retired page records. The raw value passes through _retired_validator
    # first, so the string "No bad pages found." is accepted as an empty list.
    retired: list[PageData]

    # "before"-mode validator: runs _bad_pages_retired_list on the raw
    # `retired` value ahead of Pydantic's own list[PageData] validation.
    _retired_validator = field_validator("retired", mode="before")(_bad_pages_retired_list)
486+
478487

479488
# Metric Data
480489
class MetricUsage(BaseModel):
@@ -653,6 +662,8 @@ class MetricThrottleVu(BaseModel):
653662
value: Optional[dict[str, list[Union[int, str]]]] = Field(deprecated=True, default=None)
654663
unit: str = Field(deprecated=True, default="")
655664

665+
_value_na = field_validator("value", mode="before")(na_to_none)
666+
656667

657668
class MetricThrottle(AmdSmiBaseModel):
658669
accumulation_counter: Optional[Union[MetricThrottleVu, ValueUnit]] = None
@@ -806,6 +817,7 @@ class LinkStatusTable(Enum):
806817
UP = "U"
807818
DOWN = "D"
808819
DISABLED = "X"
820+
SELF = "SELF"
809821

810822

811823
class BiDirectionalTable(Enum):
@@ -915,17 +927,6 @@ class Topo(BaseModel):
915927
links: list[TopoLink]
916928

917929

918-
class AmdSmiTstData(BaseModel):
919-
"Summary of amdsmitst results, with list and count of passing/skipped/failed tests"
920-
921-
passed_tests: list[str] = Field(default_factory=list)
922-
skipped_tests: list[str] = Field(default_factory=list)
923-
failed_tests: list[str] = Field(default_factory=list)
924-
passed_test_count: int = 0
925-
skipped_test_count: int = 0
926-
failed_test_count: int = 0
927-
928-
929930
class AmdSmiDataModel(DataModel):
930931
"""Data model for amd-smi data.
931932
@@ -955,7 +956,6 @@ class AmdSmiDataModel(DataModel):
955956
xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list)
956957
cper_data: Optional[list[FileModel]] = Field(default_factory=list)
957958
cper_afids: dict[str, int] = Field(default_factory=dict)
958-
amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData)
959959

960960
def get_list(self, gpu: int) -> Optional[AmdSmiListItem]:
961961
"""Get the gpu list item for the given gpu id."""

0 commit comments

Comments
 (0)