
Commit 18e4efc

Merge pull request #440 from redhat-performance/development
Development -> Master
2 parents c15b40a + d442227

File tree: 3 files changed, +252 -1 lines changed

src/badfish/main.py

Lines changed: 108 additions & 1 deletion
@@ -1759,7 +1759,8 @@ async def get_processor_details(self):
         processors = []
         if data.get("Members"):
             for member in data["Members"]:
-                processors.append(member["@odata.id"])
+                if "CPU" in member["@odata.id"]:
+                    processors.append(member["@odata.id"])
 
         proc_details = {}
         for processor in processors:
@@ -1793,6 +1794,83 @@ async def get_processor_details(self):
 
         return proc_details
 
+    async def get_gpu_data(self):
+        _url = "%s%s/Processors" % (self.host_uri, self.system_resource)
+        _response = await self.get_request(_url)
+
+        if _response.status == 404:
+            raise BadfishException("GPU endpoint not available on host.")
+
+        try:
+            raw = await _response.text("utf-8", "ignore")
+            data = json.loads(raw.strip())
+
+        except (ValueError, AttributeError):
+            raise BadfishException("There was something wrong getting GPU data")
+        return data
+
+    async def get_gpu_responses(self, data):
+        gpu_responses = []
+        gpu_endpoints = []
+        try:
+            if data.get("Members"):
+                for member in data["Members"]:
+                    if "Video" in member["@odata.id"] or "ProcAccelerator" in member["@odata.id"]:
+                        gpu_endpoints.append(member["@odata.id"])
+
+            for gpu in gpu_endpoints:
+                gpu_url = "%s%s" % (self.host_uri, gpu)
+                gpu_response = await self.get_request(gpu_url)
+                gpu_raw = await gpu_response.text("utf-8", "ignore")
+                gpu_data = json.loads(gpu_raw.strip())
+                gpu_responses.append(gpu_data)
+
+        except (ValueError, AttributeError):  # pragma: no cover
+            raise BadfishException("There was something wrong getting host GPU details")
+
+        return gpu_responses
+
+    async def get_gpu_summary(self, gpu_responses):
+        gpu_summary = {}
+        try:
+            for gpu_data in gpu_responses:
+
+                gpu_model = gpu_data["Model"]
+
+                if not gpu_summary.get(gpu_model):
+                    gpu_summary[gpu_model] = 1
+                else:
+                    gpu_summary[gpu_model] = gpu_summary[gpu_model] + 1
+
+        except (ValueError, AttributeError, KeyError):
+            raise BadfishException("There was something wrong getting GPU summary values.")
+        return gpu_summary
+
+    async def get_gpu_details(self, gpu_responses):
+        try:
+            gpu_details = {}
+            for gpu_data in gpu_responses:
+
+                gpu_name = gpu_data.get("Id")
+                fields = [
+                    "Model",
+                    "Manufacturer",
+                    "ProcessorType",
+                ]
+
+                values = {}
+                for field in fields:
+                    value = gpu_data.get(field)
+                    if value:
+                        values[field] = value
+
+                gpu_details.update({gpu_name: values})
+
+        except (ValueError, AttributeError):  # pragma: no cover
+            raise BadfishException("There was something wrong getting host GPU details values.")
+
+        return gpu_details
+
     async def get_memory_summary(self):
         _url = "%s%s" % (self.host_uri, self.system_resource)
         _response = await self.get_request(_url)
@@ -1916,6 +1994,27 @@ async def list_processors(self):
 
         return True
 
+    async def list_gpu(self):
+        data = await self.get_gpu_data()
+        gpu_responses = await self.get_gpu_responses(data)
+
+        summary = await self.get_gpu_summary(gpu_responses)
+
+        self.logger.info("GPU Summary:")
+        for _key, _value in summary.items():
+            self.logger.info(f" Model: {_key} (Count: {_value})")
+
+        self.logger.info("Current GPU's on host:")
+
+        gpu_data = await self.get_gpu_details(gpu_responses)
+
+        for _gpu, _properties in gpu_data.items():
+            self.logger.info(f" {_gpu}:")
+            for _key, _value in _properties.items():
+                self.logger.info(f" {_key}: {_value}")
+
+        return True
+
     async def list_memory(self):
         data = await self.get_memory_summary()
 
@@ -2431,6 +2530,7 @@ async def execute_badfish(_host, _args, logger, format_handler=None):
     check_job = _args["check_job"]
     list_jobs = _args["ls_jobs"]
     list_interfaces = _args["ls_interfaces"]
+    list_gpu = _args["ls_gpu"]
     list_processors = _args["ls_processors"]
     list_memory = _args["ls_memory"]
     list_serial = _args["ls_serial"]
@@ -2521,6 +2621,8 @@ async def execute_badfish(_host, _args, logger, format_handler=None):
         await badfish.list_interfaces()
     elif list_processors:
         await badfish.list_processors()
+    elif list_gpu:
+        await badfish.list_gpu()
     elif list_memory:
         await badfish.list_memory()
     elif list_serial:
@@ -2716,6 +2818,11 @@ def main(argv=None):
         help="List Processor Summary",
         action="store_true",
     )
+    parser.add_argument(
+        "--ls-gpu",
+        help="List GPU's on host",
+        action="store_true",
+    )
     parser.add_argument(
         "--ls-memory",
         help="List Memory Summary",

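For orientation, the hunks above make get_processor_details() keep only Processors members whose @odata.id contains "CPU", while the new get_gpu_responses() keeps members containing "Video" or "ProcAccelerator". Below is a minimal standalone sketch of that split; the CPU.Socket member ID is an assumed example, and the accelerator IDs mirror the fixtures in tests/config.py.

# Sketch only, not part of the commit: split a mixed Redfish Processors collection
# the same way the new substring filters in src/badfish/main.py do.
members = {
    "Members": [
        {"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/CPU.Socket.1"},  # assumed CPU entry
        {"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.21-1"},
        {"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.22-1"},
    ]
}

# CPU endpoints feed get_processor_details(); video/accelerator endpoints feed the GPU helpers.
cpu_endpoints = [m["@odata.id"] for m in members["Members"] if "CPU" in m["@odata.id"]]
gpu_endpoints = [
    m["@odata.id"]
    for m in members["Members"]
    if "Video" in m["@odata.id"] or "ProcAccelerator" in m["@odata.id"]
]

print(len(cpu_endpoints), len(gpu_endpoints))  # 1 2
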
tests/config.py

Lines changed: 55 additions & 0 deletions
@@ -570,7 +570,62 @@ def render_device_dict(index, device):
     "- INFO - Model: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz\n"
     "- ERROR - There was something wrong getting processor details\n"
 )
+GPU_SUMMARY_RESP = (
+    '{"GPUSummary":"AMD Instinct MI300X": 2,}'
 
+)
+GPU_SUMMARY_RESP_FAULTY = (
+    '{"GPUSummary":"Unknown: 1"}'
+)
+GPU_MEMBERS_RESP = (
+    '{"Members": ['
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.21-1"},'
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.22-1"}]}'
+)
+GPU_MEMBERS_RESP_FAULTY = (
+    '{"Members": ['
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.21-1"},'
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.22-1"}]}'
+)
+GPU_DATA_RESP1 = (
+    '{"Model": "AMD Instinct MI300X",'
+    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
+    '"ProcessorType": "Accelerator",'
+    '"Id": "ProcAccelerator.Slot.21-1"}'
+)
+GPU_DATA_RESP2 = (
+    '{"Model": "AMD Instinct MI300X",'
+    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
+    '"ProcessorType": "Accelerator",'
+    '"Id": "ProcAccelerator.Slot.22-1"}'
+)
+GPU_DATA_RESP_FAULTY = (
+    '{"GPU":"" }'
+)
+RESPONSE_LS_GPU = (
+    "- INFO - GPU Summary:\n"
+    "- INFO - Model: AMD Instinct MI300X (Count: 2)\n"
+    "- INFO - Current GPU's on host:\n"
+    "- INFO - ProcAccelerator.Slot.21-1:\n"
+    "- INFO - Model: AMD Instinct MI300X\n"
+    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
+    "- INFO - ProcessorType: Accelerator\n"
+    "- INFO - ProcAccelerator.Slot.22-1:\n"
+    "- INFO - Model: AMD Instinct MI300X\n"
+    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
+    "- INFO - ProcessorType: Accelerator\n"
+)
+
+RESPONSE_LS_GPU_SUMMARY_DATA_ERROR = "- ERROR - GPU endpoint not available on host.\n"
+RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR = "- ERROR - There was something wrong getting GPU summary values.\n"
+RESPONSE_LS_GPU_SUMMARY_BAD_JSON = "- ERROR - There was something wrong getting GPU data\n"
+RESPONSE_LS_GPU_DETAILS_NOT_FOUND = "- ERROR - There was something wrong getting host GPU details\n"
+RESPONSE_LS_GPU_DETAILS_VALUE_ERROR = (
+    "- INFO - GPU Summary:\n"
+    "- INFO - Model: AMD Instinct MI300X OAM\n"
+    "- INFO - Current GPU's on host:\n"
+    "- ERROR - There was something wrong getting host GPU detailed values.\n"
+)
 DELL_REDFISH_ROOT_OEM_RESP = """
 {"Oem":
     {"Dell":

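The GPU_DATA_RESP* fixtures above are the per-endpoint payloads that get_gpu_summary() and get_gpu_details() consume, and RESPONSE_LS_GPU is the log output expected for two identical MI300X accelerators. A small standalone sketch of that reduction, reusing the same payload shape (condensed here; not part of the commit):

# Sketch only: reduce two GPU detail payloads to the summary counts and per-GPU
# details asserted by RESPONSE_LS_GPU, mirroring get_gpu_summary()/get_gpu_details().
import json

gpu_responses = [
    json.loads(
        '{"Model": "AMD Instinct MI300X",'
        ' "Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
        ' "ProcessorType": "Accelerator",'
        f' "Id": "ProcAccelerator.Slot.{slot}-1"}}'
    )
    for slot in (21, 22)
]

# Count GPUs per model, as get_gpu_summary() does.
summary = {}
for gpu in gpu_responses:
    summary[gpu["Model"]] = summary.get(gpu["Model"], 0) + 1

# Keep only the reported fields per GPU Id, as get_gpu_details() does.
details = {
    gpu.get("Id"): {field: gpu[field] for field in ("Model", "Manufacturer", "ProcessorType") if gpu.get(field)}
    for gpu in gpu_responses
}

print(summary)           # {'AMD Instinct MI300X': 2}
print(sorted(details))   # ['ProcAccelerator.Slot.21-1', 'ProcAccelerator.Slot.22-1']
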
tests/test_ls_gpu.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+import pytest
+from unittest.mock import patch
+
+from src.badfish.main import BadfishException
+from tests.config import (
+    INIT_RESP,
+    GPU_MEMBERS_RESP,
+    GPU_MEMBERS_RESP_FAULTY,
+    GPU_DATA_RESP1,
+    GPU_DATA_RESP2,
+    GPU_SUMMARY_RESP,
+    RESPONSE_LS_GPU,
+    GPU_SUMMARY_RESP_FAULTY,
+    RESPONSE_LS_GPU_SUMMARY_DATA_ERROR,
+    RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR,
+    RESPONSE_LS_GPU_DETAILS_NOT_FOUND,
+    RESPONSE_LS_GPU_DETAILS_VALUE_ERROR, RESPONSE_LS_GPU_SUMMARY_BAD_JSON, GPU_DATA_RESP_FAULTY,
+)
+from tests.test_base import TestBase
+
+
+class TestLsGpu(TestBase):
+    option_arg = "--ls-gpu"
+
+    @patch("aiohttp.ClientSession.delete")
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu(self, mock_get, mock_post, mock_delete):
+        responses_add = [
+            GPU_MEMBERS_RESP,
+            GPU_DATA_RESP1,
+            GPU_DATA_RESP2
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_delete, 200, "OK")
+        self.set_mock_response(mock_post, 200, "OK")
+        self.set_mock_response(mock_get, 200, responses)
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU
+
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu_data_not_available(
+        self, mock_get, mock_post
+    ):
+        responses_add = [
+            GPU_SUMMARY_RESP_FAULTY,
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_post, 200, "OK")
+        self.set_mock_response(mock_get, [200, 200, 200, 200, 200, 404], responses)
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR
+
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu_summary_data_error(
+        self, mock_get, mock_post
+    ):
+        responses_add = [
+            GPU_MEMBERS_RESP,
+            GPU_DATA_RESP1,
+            GPU_DATA_RESP_FAULTY,
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_get, [200, 200, 200, 200, 200, 404, 200, 200], responses)
+        self.set_mock_response(mock_post, 200, "OK")
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR
+
+    @patch("aiohttp.ClientSession.delete")
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu_summary_value_error(self, mock_get, mock_post, mock_delete):
+        responses_add = [
+            GPU_MEMBERS_RESP,
+            GPU_DATA_RESP1,
+            GPU_DATA_RESP_FAULTY,
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_get, 200, responses)
+        self.set_mock_response(mock_post, 200, "OK")
+        self.set_mock_response(mock_delete, 200, "OK")
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR

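Several fixtures imported at the top of this new test module (for example GPU_MEMBERS_RESP_FAULTY, RESPONSE_LS_GPU_DETAILS_NOT_FOUND and RESPONSE_LS_GPU_SUMMARY_BAD_JSON) are not exercised by the four tests shown. A hypothetical follow-up case for the details-not-found path is sketched below; it assumes set_mock_response() replays the list entries in request order, so the non-JSON body is served for the first per-GPU request and trips the ValueError branch in get_gpu_responses().

    # Hypothetical sketch, not part of this commit: cover RESPONSE_LS_GPU_DETAILS_NOT_FOUND.
    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_details_not_found(self, mock_get, mock_post):
        responses_add = [
            GPU_MEMBERS_RESP,
            "Bad Request",  # not valid JSON -> json.loads() raises ValueError
        ]
        responses = INIT_RESP + responses_add
        self.set_mock_response(mock_post, 200, "OK")
        self.set_mock_response(mock_get, 200, responses)
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_DETAILS_NOT_FOUND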