
Commit 18e4efc

Merge pull request #440 from redhat-performance/development
Development -> Master
2 parents c15b40a + d442227

File tree: 3 files changed, +252 -1 lines changed

src/badfish/main.py

Lines changed: 108 additions & 1 deletion
@@ -1759,7 +1759,8 @@ async def get_processor_details(self):
         processors = []
         if data.get("Members"):
             for member in data["Members"]:
-                processors.append(member["@odata.id"])
+                if "CPU" in member["@odata.id"]:
+                    processors.append(member["@odata.id"])
 
         proc_details = {}
         for processor in processors:
@@ -1793,6 +1794,83 @@ async def get_processor_details(self):
 
         return proc_details
 
+    async def get_gpu_data(self):
+        _url = "%s%s/Processors" % (self.host_uri, self.system_resource)
+        _response = await self.get_request(_url)
+
+        if _response.status == 404:
+            raise BadfishException("GPU endpoint not available on host.")
+
+        try:
+            raw = await _response.text("utf-8", "ignore")
+            data = json.loads(raw.strip())
+
+        except (ValueError, AttributeError):
+            raise BadfishException("There was something wrong getting GPU data")
+        return data
+
+    async def get_gpu_responses(self, data):
+        gpu_responses = []
+        gpu_endpoints = []
+        try:
+            if data.get("Members"):
+                for member in data["Members"]:
+                    if "Video" in member["@odata.id"] or "ProcAccelerator" in member["@odata.id"]:
+                        gpu_endpoints.append(member["@odata.id"])
+
+            for gpu in gpu_endpoints:
+                gpu_url = "%s%s" % (self.host_uri, gpu)
+                gpu_response = await self.get_request(gpu_url)
+                gpu_raw = await gpu_response.text("utf-8", "ignore")
+                gpu_data = json.loads(gpu_raw.strip())
+                gpu_responses.append(gpu_data)
+
+        except (ValueError, AttributeError):  # pragma: no cover
+            raise BadfishException("There was something wrong getting host GPU details")
+
+        return gpu_responses
+
+    async def get_gpu_summary(self, gpu_responses):
+        gpu_summary = {}
+        try:
+            for gpu_data in gpu_responses:
+
+                gpu_model = gpu_data["Model"]
+
+                if not gpu_summary.get(gpu_model):
+                    gpu_summary[gpu_model] = 1
+                else:
+                    gpu_summary[gpu_model] = gpu_summary[gpu_model] + 1
+
+        except (ValueError, AttributeError, KeyError):
+            raise BadfishException("There was something wrong getting GPU summary values.")
+        return gpu_summary
+
+    async def get_gpu_details(self, gpu_responses):
+        try:
+            gpu_details = {}
+            for gpu_data in gpu_responses:
+
+                gpu_name = gpu_data.get("Id")
+                fields = [
+                    "Model",
+                    "Manufacturer",
+                    "ProcessorType",
+                ]
+
+                values = {}
+                for field in fields:
+                    value = gpu_data.get(field)
+                    if value:
+                        values[field] = value
+
+                gpu_details.update({gpu_name: values})
+
+        except (ValueError, AttributeError):  # pragma: no cover
+            raise BadfishException("There was something wrong getting host GPU details values.")
+
+        return gpu_details
+
     async def get_memory_summary(self):
         _url = "%s%s" % (self.host_uri, self.system_resource)
         _response = await self.get_request(_url)
@@ -1916,6 +1994,27 @@ async def list_processors(self):
 
         return True
 
+    async def list_gpu(self):
+        data = await self.get_gpu_data()
+        gpu_responses = await self.get_gpu_responses(data)
+
+        summary = await self.get_gpu_summary(gpu_responses)
+
+        self.logger.info("GPU Summary:")
+        for _key, _value in summary.items():
+            self.logger.info(f" Model: {_key} (Count: {_value})")
+
+        self.logger.info("Current GPU's on host:")
+
+        gpu_data = await self.get_gpu_details(gpu_responses)
+
+        for _gpu, _properties in gpu_data.items():
+            self.logger.info(f" {_gpu}:")
+            for _key, _value in _properties.items():
+                self.logger.info(f" {_key}: {_value}")
+
+        return True
+
     async def list_memory(self):
         data = await self.get_memory_summary()
 
@@ -2431,6 +2530,7 @@ async def execute_badfish(_host, _args, logger, format_handler=None):
     check_job = _args["check_job"]
     list_jobs = _args["ls_jobs"]
     list_interfaces = _args["ls_interfaces"]
+    list_gpu = _args["ls_gpu"]
     list_processors = _args["ls_processors"]
     list_memory = _args["ls_memory"]
     list_serial = _args["ls_serial"]
@@ -2521,6 +2621,8 @@ async def execute_badfish(_host, _args, logger, format_handler=None):
         await badfish.list_interfaces()
     elif list_processors:
         await badfish.list_processors()
+    elif list_gpu:
+        await badfish.list_gpu()
     elif list_memory:
         await badfish.list_memory()
     elif list_serial:
@@ -2716,6 +2818,11 @@ def main(argv=None):
         help="List Processor Summary",
         action="store_true",
     )
+    parser.add_argument(
+        "--ls-gpu",
+        help="List GPU's on host",
+        action="store_true",
+    )
     parser.add_argument(
         "--ls-memory",
         help="List Memory Summary",

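For orientation, the hunks above make get_processor_details() keep only Processors members whose @odata.id contains "CPU", while the new get_gpu_responses() keeps members containing "Video" or "ProcAccelerator". Below is a minimal standalone sketch of that split; the CPU.Socket member ID is an assumed example, and the accelerator IDs mirror the fixtures in tests/config.py.

# Sketch only, not part of the commit: split a mixed Redfish Processors collection
# the same way the new substring filters in src/badfish/main.py do.
members = {
    "Members": [
        {"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/CPU.Socket.1"},  # assumed CPU entry
        {"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.21-1"},
        {"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.22-1"},
    ]
}

# CPU endpoints feed get_processor_details(); video/accelerator endpoints feed the GPU helpers.
cpu_endpoints = [m["@odata.id"] for m in members["Members"] if "CPU" in m["@odata.id"]]
gpu_endpoints = [
    m["@odata.id"]
    for m in members["Members"]
    if "Video" in m["@odata.id"] or "ProcAccelerator" in m["@odata.id"]
]

print(len(cpu_endpoints), len(gpu_endpoints))  # 1 2
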
tests/config.py

Lines changed: 55 additions & 0 deletions
@@ -570,7 +570,62 @@ def render_device_dict(index, device):
     "- INFO - Model: Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz\n"
     "- ERROR - There was something wrong getting processor details\n"
 )
+GPU_SUMMARY_RESP = (
+    '{"GPUSummary":"AMD Instinct MI300X": 2,}'
 
+)
+GPU_SUMMARY_RESP_FAULTY = (
+    '{"GPUSummary":"Unknown: 1"}'
+)
+GPU_MEMBERS_RESP = (
+    '{"Members": ['
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.21-1"},'
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/ProcAccelerator.Slot.22-1"}]}'
+)
+GPU_MEMBERS_RESP_FAULTY = (
+    '{"Members": ['
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.21-1"},'
+    '{"@odata.id": "/redfish/v1/Systems/System.Embedded.1/Processors/GPU.Slot.22-1"}]}'
+)
+GPU_DATA_RESP1 = (
+    '{"Model": "AMD Instinct MI300X",'
+    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
+    '"ProcessorType": "Accelerator",'
+    '"Id": "ProcAccelerator.Slot.21-1"}'
+)
+GPU_DATA_RESP2 = (
+    '{"Model": "AMD Instinct MI300X",'
+    '"Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
+    '"ProcessorType": "Accelerator",'
+    '"Id": "ProcAccelerator.Slot.22-1"}'
+)
+GPU_DATA_RESP_FAULTY = (
+    '{"GPU":"" }'
+)
+RESPONSE_LS_GPU = (
+    "- INFO - GPU Summary:\n"
+    "- INFO - Model: AMD Instinct MI300X (Count: 2)\n"
+    "- INFO - Current GPU's on host:\n"
+    "- INFO - ProcAccelerator.Slot.21-1:\n"
+    "- INFO - Model: AMD Instinct MI300X\n"
+    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
+    "- INFO - ProcessorType: Accelerator\n"
+    "- INFO - ProcAccelerator.Slot.22-1:\n"
+    "- INFO - Model: AMD Instinct MI300X\n"
+    "- INFO - Manufacturer: Advanced Micro Devices, Inc. [AMD/ATI]\n"
+    "- INFO - ProcessorType: Accelerator\n"
+)
+
+RESPONSE_LS_GPU_SUMMARY_DATA_ERROR = "- ERROR - GPU endpoint not available on host.\n"
+RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR = "- ERROR - There was something wrong getting GPU summary values.\n"
+RESPONSE_LS_GPU_SUMMARY_BAD_JSON = "- ERROR - There was something wrong getting GPU data\n"
+RESPONSE_LS_GPU_DETAILS_NOT_FOUND = "- ERROR - There was something wrong getting host GPU details\n"
+RESPONSE_LS_GPU_DETAILS_VALUE_ERROR = (
+    "- INFO - GPU Summary:\n"
+    "- INFO - Model: AMD Instinct MI300X OAM\n"
+    "- INFO - Current GPU's on host:\n"
+    "- ERROR - There was something wrong getting host GPU detailed values.\n"
+)
 DELL_REDFISH_ROOT_OEM_RESP = """
 {"Oem":
     {"Dell":

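The GPU_DATA_RESP* fixtures above are the per-endpoint payloads that get_gpu_summary() and get_gpu_details() consume, and RESPONSE_LS_GPU is the log output expected for two identical MI300X accelerators. A small standalone sketch of that reduction, reusing the same payload shape (condensed here; not part of the commit):

# Sketch only: reduce two GPU detail payloads to the summary counts and per-GPU
# details asserted by RESPONSE_LS_GPU, mirroring get_gpu_summary()/get_gpu_details().
import json

gpu_responses = [
    json.loads(
        '{"Model": "AMD Instinct MI300X",'
        ' "Manufacturer": "Advanced Micro Devices, Inc. [AMD/ATI]",'
        ' "ProcessorType": "Accelerator",'
        f' "Id": "ProcAccelerator.Slot.{slot}-1"}}'
    )
    for slot in (21, 22)
]

# Count GPUs per model, as get_gpu_summary() does.
summary = {}
for gpu in gpu_responses:
    summary[gpu["Model"]] = summary.get(gpu["Model"], 0) + 1

# Keep only the reported fields per GPU Id, as get_gpu_details() does.
details = {
    gpu.get("Id"): {field: gpu[field] for field in ("Model", "Manufacturer", "ProcessorType") if gpu.get(field)}
    for gpu in gpu_responses
}

print(summary)           # {'AMD Instinct MI300X': 2}
print(sorted(details))   # ['ProcAccelerator.Slot.21-1', 'ProcAccelerator.Slot.22-1']
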
tests/test_ls_gpu.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+import pytest
+from unittest.mock import patch
+
+from src.badfish.main import BadfishException
+from tests.config import (
+    INIT_RESP,
+    GPU_MEMBERS_RESP,
+    GPU_MEMBERS_RESP_FAULTY,
+    GPU_DATA_RESP1,
+    GPU_DATA_RESP2,
+    GPU_SUMMARY_RESP,
+    RESPONSE_LS_GPU,
+    GPU_SUMMARY_RESP_FAULTY,
+    RESPONSE_LS_GPU_SUMMARY_DATA_ERROR,
+    RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR,
+    RESPONSE_LS_GPU_DETAILS_NOT_FOUND,
+    RESPONSE_LS_GPU_DETAILS_VALUE_ERROR, RESPONSE_LS_GPU_SUMMARY_BAD_JSON, GPU_DATA_RESP_FAULTY,
+)
+from tests.test_base import TestBase
+
+
+class TestLsGpu(TestBase):
+    option_arg = "--ls-gpu"
+
+    @patch("aiohttp.ClientSession.delete")
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu(self, mock_get, mock_post, mock_delete):
+        responses_add = [
+            GPU_MEMBERS_RESP,
+            GPU_DATA_RESP1,
+            GPU_DATA_RESP2
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_delete, 200, "OK")
+        self.set_mock_response(mock_post, 200, "OK")
+        self.set_mock_response(mock_get, 200, responses)
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU
+
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu_data_not_available(
+        self, mock_get, mock_post
+    ):
+        responses_add = [
+            GPU_SUMMARY_RESP_FAULTY,
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_post, 200, "OK")
+        self.set_mock_response(mock_get, [200, 200, 200, 200, 200, 404], responses)
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR
+
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu_summary_data_error(
+        self, mock_get, mock_post
+    ):
+        responses_add = [
+            GPU_MEMBERS_RESP,
+            GPU_DATA_RESP1,
+            GPU_DATA_RESP_FAULTY,
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_get, [200, 200, 200, 200, 200, 404, 200, 200], responses)
+        self.set_mock_response(mock_post, 200, "OK")
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU_SUMMARY_DATA_ERROR
+
+    @patch("aiohttp.ClientSession.delete")
+    @patch("aiohttp.ClientSession.post")
+    @patch("aiohttp.ClientSession.get")
+    def test_ls_gpu_summary_value_error(self, mock_get, mock_post, mock_delete):
+        responses_add = [
+            GPU_MEMBERS_RESP,
+            GPU_DATA_RESP1,
+            GPU_DATA_RESP_FAULTY,
+        ]
+        responses = INIT_RESP + responses_add
+        self.set_mock_response(mock_get, 200, responses)
+        self.set_mock_response(mock_post, 200, "OK")
+        self.set_mock_response(mock_delete, 200, "OK")
+        self.args = [self.option_arg]
+        _, err = self.badfish_call()
+        assert err == RESPONSE_LS_GPU_SUMMARY_VALUE_ERROR

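Several fixtures imported at the top of this new test module (for example GPU_MEMBERS_RESP_FAULTY, RESPONSE_LS_GPU_DETAILS_NOT_FOUND and RESPONSE_LS_GPU_SUMMARY_BAD_JSON) are not exercised by the four tests shown. A hypothetical follow-up case for the details-not-found path is sketched below; it assumes set_mock_response() replays the list entries in request order, so the non-JSON body is served for the first per-GPU request and trips the ValueError branch in get_gpu_responses().

    # Hypothetical sketch, not part of this commit: cover RESPONSE_LS_GPU_DETAILS_NOT_FOUND.
    @patch("aiohttp.ClientSession.post")
    @patch("aiohttp.ClientSession.get")
    def test_ls_gpu_details_not_found(self, mock_get, mock_post):
        responses_add = [
            GPU_MEMBERS_RESP,
            "Bad Request",  # not valid JSON -> json.loads() raises ValueError
        ]
        responses = INIT_RESP + responses_add
        self.set_mock_response(mock_post, 200, "OK")
        self.set_mock_response(mock_get, 200, responses)
        self.args = [self.option_arg]
        _, err = self.badfish_call()
        assert err == RESPONSE_LS_GPU_DETAILS_NOT_FOUND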