@@ -1759,7 +1759,8 @@ async def get_processor_details(self):
1759
1759
processors = []
1760
1760
if data .get ("Members" ):
1761
1761
for member in data ["Members" ]:
1762
- processors .append (member ["@odata.id" ])
1762
+ if "CPU" in member ["@odata.id" ]:
1763
+ processors .append (member ["@odata.id" ])
1763
1764
1764
1765
proc_details = {}
1765
1766
for processor in processors :
@@ -1793,6 +1794,83 @@ async def get_processor_details(self):
1793
1794
1794
1795
return proc_details
1795
1796
1797
async def get_gpu_data(self):
    """Fetch and decode the Processors collection of the system resource.

    Sends a GET request to ``<host_uri><system_resource>/Processors`` and
    parses the response body as JSON.

    Returns:
        The decoded JSON payload of the Processors collection.

    Raises:
        BadfishException: If the endpoint answers with HTTP 404, or if the
            response body cannot be read or decoded as JSON.
    """
    endpoint = f"{self.host_uri}{self.system_resource}/Processors"
    response = await self.get_request(endpoint)

    # A 404 means the host does not expose the Processors collection at all.
    if response.status == 404:
        raise BadfishException("GPU endpoint not available on host.")

    try:
        body = await response.text("utf-8", "ignore")
        return json.loads(body.strip())
    except (ValueError, AttributeError):
        raise BadfishException("There was something wrong getting GPU data")
1811
+
1812
async def get_gpu_responses(self, data):
    """Fetch the resource behind every GPU-like member of a collection.

    A member is treated as a GPU when its ``@odata.id`` contains ``Video``
    or ``ProcAccelerator``. Each matching resource is requested relative to
    ``self.host_uri`` and its body is decoded as JSON.

    Args:
        data: Decoded collection payload. When it has a non-empty
            ``"Members"`` list, each entry is expected to be a mapping
            with an ``"@odata.id"`` key.

    Returns:
        A list with one decoded JSON payload per matching member, in the
        order the members appear in ``data``. Empty when there are no
        members or none of them match.

    Raises:
        BadfishException: If a member entry is malformed (missing
            ``"@odata.id"`` or not a mapping), or if a GPU response cannot
            be read or decoded as JSON.
    """
    gpu_markers = ("Video", "ProcAccelerator")
    gpu_responses = []
    try:
        gpu_endpoints = [
            member["@odata.id"]
            for member in data.get("Members") or []
            if any(marker in member["@odata.id"] for marker in gpu_markers)
        ]

        for gpu in gpu_endpoints:
            gpu_url = "%s%s" % (self.host_uri, gpu)
            gpu_response = await self.get_request(gpu_url)
            gpu_raw = await gpu_response.text("utf-8", "ignore")
            gpu_responses.append(json.loads(gpu_raw.strip()))

    # KeyError/TypeError cover malformed member entries, so callers get the
    # same BadfishException as for any other failure instead of a raw
    # traceback; the original cause is kept via exception chaining.
    except (ValueError, AttributeError, KeyError, TypeError) as err:  # pragma: no cover
        raise BadfishException("There was something wrong getting host GPU details") from err

    return gpu_responses
1832
+
1833
async def get_gpu_summary(self, gpu_responses):
    """Count how many GPUs of each model are present.

    Args:
        gpu_responses: Iterable of decoded GPU payloads; each one must
            provide a ``"Model"`` key.

    Returns:
        A dict mapping each model value to the number of payloads that
        reported it.

    Raises:
        BadfishException: If a payload has no ``"Model"`` key, or reading
            it raises ``ValueError`` or ``AttributeError``.
    """
    counts = {}
    try:
        for payload in gpu_responses:
            model = payload["Model"]
            counts[model] = counts.get(model, 0) + 1
    except (ValueError, AttributeError, KeyError):
        raise BadfishException("There was something wrong getting GPU summary values.")
    return counts
1848
+
1849
async def get_gpu_details(self, gpu_responses):
    """Collect the reported identity fields of every GPU payload.

    For each payload, the ``Model``, ``Manufacturer`` and ``ProcessorType``
    entries are copied when present and truthy; missing or empty fields
    are left out.

    Args:
        gpu_responses: Iterable of decoded GPU payloads (mappings).

    Returns:
        A dict keyed by each payload's ``"Id"`` value (``None`` when the
        payload has none) whose values are dicts of the selected fields.

    Raises:
        BadfishException: If reading a payload raises ``ValueError`` or
            ``AttributeError``.
    """
    wanted = ("Model", "Manufacturer", "ProcessorType")
    try:
        details = {}
        for payload in gpu_responses:
            name = payload.get("Id")
            candidates = [(key, payload.get(key)) for key in wanted]
            details[name] = {key: value for key, value in candidates if value}

    except (ValueError, AttributeError):  # pragma: no cover
        raise BadfishException("There was something wrong getting host GPU details values.")

    return details
1873
+
1796
1874
async def get_memory_summary (self ):
1797
1875
_url = "%s%s" % (self .host_uri , self .system_resource )
1798
1876
_response = await self .get_request (_url )
@@ -1916,6 +1994,27 @@ async def list_processors(self):
1916
1994
1917
1995
return True
1918
1996
1997
async def list_gpu(self):
    """Log a per-model GPU summary followed by per-GPU details.

    Retrieves the GPU payloads once through ``get_gpu_data`` and
    ``get_gpu_responses``, then logs the result of ``get_gpu_summary`` and
    ``get_gpu_details`` through ``self.logger.info``.

    Returns:
        True once everything has been logged.
    """
    collection = await self.get_gpu_data()
    responses = await self.get_gpu_responses(collection)

    per_model = await self.get_gpu_summary(responses)
    self.logger.info("GPU Summary:")
    for model, count in per_model.items():
        self.logger.info(f" Model: { model } (Count: { count } )")

    self.logger.info("Current GPU's on host:")
    per_gpu = await self.get_gpu_details(responses)
    for gpu_name, properties in per_gpu.items():
        self.logger.info(f" { gpu_name } :")
        for label, value in properties.items():
            self.logger.info(f" { label } : { value } ")

    return True
2017
+
1919
2018
async def list_memory (self ):
1920
2019
data = await self .get_memory_summary ()
1921
2020
@@ -2431,6 +2530,7 @@ async def execute_badfish(_host, _args, logger, format_handler=None):
2431
2530
check_job = _args ["check_job" ]
2432
2531
list_jobs = _args ["ls_jobs" ]
2433
2532
list_interfaces = _args ["ls_interfaces" ]
2533
+ list_gpu = _args ["ls_gpu" ]
2434
2534
list_processors = _args ["ls_processors" ]
2435
2535
list_memory = _args ["ls_memory" ]
2436
2536
list_serial = _args ["ls_serial" ]
@@ -2521,6 +2621,8 @@ async def execute_badfish(_host, _args, logger, format_handler=None):
2521
2621
await badfish .list_interfaces ()
2522
2622
elif list_processors :
2523
2623
await badfish .list_processors ()
2624
+ elif list_gpu :
2625
+ await badfish .list_gpu ()
2524
2626
elif list_memory :
2525
2627
await badfish .list_memory ()
2526
2628
elif list_serial :
@@ -2716,6 +2818,11 @@ def main(argv=None):
2716
2818
help = "List Processor Summary" ,
2717
2819
action = "store_true" ,
2718
2820
)
2821
+ parser .add_argument (
2822
+ "--ls-gpu" ,
2823
+ help = "List GPU's on host" ,
2824
+ action = "store_true" ,
2825
+ )
2719
2826
parser .add_argument (
2720
2827
"--ls-memory" ,
2721
2828
help = "List Memory Summary" ,
0 commit comments