23
23
listed in "sky --help". Take care to put logically connected commands close to
24
24
each other.
25
25
"""
26
+ import collections
26
27
import copy
27
28
import datetime
28
29
import functools
@@ -3413,7 +3414,7 @@ def _list_to_str(lst):
3413
3414
3414
3415
# TODO(zhwu,romilb): We should move most of these kubernetes related
3415
3416
# queries into the backend, especially behind the server.
3416
- def _get_kubernetes_realtime_gpu_table (
3417
+ def _get_kubernetes_realtime_gpu_tables (
3417
3418
context : Optional [str ] = None ,
3418
3419
name_filter : Optional [str ] = None ,
3419
3420
quantity_filter : Optional [int ] = None ):
@@ -3423,42 +3424,67 @@ def _get_kubernetes_realtime_gpu_table(
3423
3424
else :
3424
3425
qty_header = 'REQUESTABLE_QTY_PER_NODE'
3425
3426
free_header = 'TOTAL_FREE_GPUS'
3426
- realtime_gpu_table = log_utils .create_table (
3427
- ['GPU' , qty_header , 'TOTAL_GPUS' , free_header ])
3428
- realtime_gpu_availability_list = sdk .stream_and_get (
3427
+
3428
+ realtime_gpu_availability_lists = sdk .stream_and_get (
3429
3429
sdk .realtime_kubernetes_gpu_availability (
3430
3430
context = context ,
3431
3431
name_filter = name_filter ,
3432
3432
quantity_filter = quantity_filter ))
3433
- if not realtime_gpu_availability_list :
3434
- err_msg = 'No GPUs found in Kubernetes cluster. '
3433
+ if not realtime_gpu_availability_lists :
3434
+ err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
3435
3435
debug_msg = 'To further debug, run: sky check '
3436
3436
if name_filter is not None :
3437
3437
gpu_info_msg = f' { name_filter !r} '
3438
3438
if quantity_filter is not None :
3439
3439
gpu_info_msg += (' with requested quantity'
3440
3440
f' { quantity_filter } ' )
3441
3441
err_msg = (f'Resources{ gpu_info_msg } not found '
3442
- 'in Kubernetes cluster. ' )
3442
+ 'in any allowed Kubernetes cluster. ' )
3443
3443
debug_msg = ('To show available accelerators on kubernetes,'
3444
3444
' run: sky show-gpus --cloud kubernetes ' )
3445
3445
full_err_msg = (err_msg + kubernetes_constants .NO_GPU_HELP_MESSAGE +
3446
3446
debug_msg )
3447
3447
raise ValueError (full_err_msg )
3448
3448
no_permissions_str = '<no permissions>'
3449
- for realtime_gpu_availability in sorted (realtime_gpu_availability_list ):
3450
- gpu_availability = models .RealtimeGpuAvailability (
3451
- * realtime_gpu_availability )
3452
- available_qty = (gpu_availability .available
3453
- if gpu_availability .available != - 1 else
3454
- no_permissions_str )
3455
- realtime_gpu_table .add_row ([
3456
- gpu_availability .gpu ,
3457
- _list_to_str (gpu_availability .counts ),
3458
- gpu_availability .capacity ,
3459
- available_qty ,
3460
- ])
3461
- return realtime_gpu_table
3449
+ realtime_gpu_infos = []
3450
+ total_gpu_info : Dict [str , List [int ]] = collections .defaultdict (
3451
+ lambda : [0 , 0 ])
3452
+
3453
+ for (ctx , availability_list ) in realtime_gpu_availability_lists :
3454
+ realtime_gpu_table = log_utils .create_table (
3455
+ ['GPU' , qty_header , 'TOTAL_GPUS' , free_header ])
3456
+ for realtime_gpu_availability in sorted (availability_list ):
3457
+ gpu_availability = models .RealtimeGpuAvailability (
3458
+ * realtime_gpu_availability )
3459
+ available_qty = (gpu_availability .available
3460
+ if gpu_availability .available != - 1 else
3461
+ no_permissions_str )
3462
+ realtime_gpu_table .add_row ([
3463
+ gpu_availability .gpu ,
3464
+ _list_to_str (gpu_availability .counts ),
3465
+ gpu_availability .capacity ,
3466
+ available_qty ,
3467
+ ])
3468
+ gpu = gpu_availability .gpu
3469
+ capacity = gpu_availability .capacity
3470
+ # We want totals, so count permission-denied (-1) availability as 0.
3471
+ available = max (gpu_availability .available , 0 )
3472
+ if capacity > 0 :
3473
+ total_gpu_info [gpu ][0 ] += capacity
3474
+ total_gpu_info [gpu ][1 ] += available
3475
+ realtime_gpu_infos .append ((ctx , realtime_gpu_table ))
3476
+
3477
+ # display an aggregated table for all contexts
3478
+ # if there is more than one context with GPUs
3479
+ if len (realtime_gpu_infos ) > 1 :
3480
+ total_realtime_gpu_table = log_utils .create_table (
3481
+ ['GPU' , 'TOTAL_GPUS' , free_header ])
3482
+ for gpu , stats in total_gpu_info .items ():
3483
+ total_realtime_gpu_table .add_row ([gpu , stats [0 ], stats [1 ]])
3484
+ else :
3485
+ total_realtime_gpu_table = None
3486
+
3487
+ return realtime_gpu_infos , total_realtime_gpu_table
3462
3488
3463
3489
def _format_kubernetes_node_info (context : Optional [str ]):
3464
3490
node_table = log_utils .create_table (
@@ -3479,7 +3505,7 @@ def _format_kubernetes_node_info(context: Optional[str]):
3479
3505
'Kubernetes per node accelerator availability ' )
3480
3506
if nodes_info .hint :
3481
3507
k8s_per_node_acc_message += nodes_info .hint
3482
- return (f'{ colorama .Fore .CYAN } { colorama .Style .BRIGHT } '
3508
+ return (f'{ colorama .Fore .LIGHTMAGENTA_EX } { colorama .Style .NORMAL } '
3483
3509
f'{ k8s_per_node_acc_message } '
3484
3510
f'{ colorama .Style .RESET_ALL } \n '
3485
3511
f'{ node_table .get_string ()} ' )
@@ -3516,22 +3542,32 @@ def _output() -> Generator[str, None, None]:
3516
3542
# If --cloud kubernetes is not specified, we want to catch
3517
3543
# the case where no GPUs are available on the cluster and
3518
3544
# print the warning at the end.
3519
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table (
3520
- context )
3545
+ k8s_realtime_infos , total_table = _get_kubernetes_realtime_gpu_tables (context ) # pylint: disable=line-too-long
3521
3546
except ValueError as e :
3522
3547
if not cloud_is_kubernetes :
3523
3548
# Make it a note if cloud is not kubernetes
3524
3549
k8s_messages += 'Note: '
3525
3550
k8s_messages += str (e )
3526
3551
else :
3527
3552
print_section_titles = True
3528
- context_str = f'(Context: { context } )' if context else ''
3529
- yield (f'{ colorama .Fore .CYAN } { colorama .Style .BRIGHT } '
3530
- f'Kubernetes GPUs { context_str } '
3531
- f'{ colorama .Style .RESET_ALL } \n ' )
3532
- yield from k8s_realtime_table .get_string ()
3533
- yield '\n \n '
3534
- yield _format_kubernetes_node_info (context )
3553
+
3554
+ # print total table
3555
+ if total_table is not None :
3556
+ yield (f'{ colorama .Fore .GREEN } { colorama .Style .BRIGHT } '
3557
+ 'Total Kubernetes GPUs'
3558
+ f'{ colorama .Style .RESET_ALL } \n ' )
3559
+ yield from total_table .get_string ()
3560
+ yield '\n -----\n \n '
3561
+
3562
+ # print individual infos.
3563
+ for (ctx , k8s_realtime_table ) in k8s_realtime_infos :
3564
+ context_str = f'(Context: { ctx } )' if ctx else ''
3565
+ yield (f'{ colorama .Fore .CYAN } { colorama .Style .BRIGHT } '
3566
+ f'Kubernetes GPUs { context_str } '
3567
+ f'{ colorama .Style .RESET_ALL } \n ' )
3568
+ yield from k8s_realtime_table .get_string ()
3569
+ yield '\n \n '
3570
+ yield _format_kubernetes_node_info (ctx ) + '\n -----\n \n '
3535
3571
if kubernetes_autoscaling :
3536
3572
k8s_messages += (
3537
3573
'\n ' + kubernetes_utils .KUBERNETES_AUTOSCALER_NOTE )
@@ -3620,13 +3656,29 @@ def _output() -> Generator[str, None, None]:
3620
3656
# Print section title if not showing all and instead a specific
3621
3657
# accelerator is requested
3622
3658
print_section_titles = True
3623
- yield (f'{ colorama .Fore .CYAN } { colorama .Style .BRIGHT } '
3624
- f'Kubernetes GPUs{ colorama .Style .RESET_ALL } \n ' )
3625
3659
# TODO(romilb): Show filtered per node GPU availability here as well
3626
3660
try :
3627
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table (
3628
- name_filter = name , quantity_filter = quantity )
3629
- yield from k8s_realtime_table .get_string ()
3661
+ k8s_realtime_infos , total_table = _get_kubernetes_realtime_gpu_tables ( # pylint: disable=line-too-long
3662
+ context = region ,
3663
+ name_filter = name ,
3664
+ quantity_filter = quantity )
3665
+
3666
+ # print total table
3667
+ if total_table is not None :
3668
+ yield (f'{ colorama .Fore .GREEN } { colorama .Style .BRIGHT } '
3669
+ 'Total Kubernetes GPUs'
3670
+ f'{ colorama .Style .RESET_ALL } \n ' )
3671
+ yield from total_table .get_string ()
3672
+ yield '\n -----\n \n '
3673
+
3674
+ # print individual tables
3675
+ for (ctx , k8s_realtime_table ) in k8s_realtime_infos :
3676
+ context_str = f'(Context: { ctx } )' if ctx else ''
3677
+ yield (f'{ colorama .Fore .CYAN } { colorama .Style .BRIGHT } '
3678
+ f'Kubernetes GPUs { context_str } '
3679
+ f'{ colorama .Style .RESET_ALL } \n ' )
3680
+ yield from k8s_realtime_table .get_string ()
3681
+ yield '\n \n '
3630
3682
except ValueError as e :
3631
3683
# In the case of a specific accelerator, show the error message
3632
3684
# immediately (e.g., "Resources H100 not found ...")
0 commit comments