Skip to content

Commit 5fb3933

Browse files
kyuds and SeungjinYang authored
[UX][k8s] show-gpus for all allowed contexts (#5362)
* preliminary implementation * fix errors * add total table for k8s gpus * resolve comments * Update sky/cli.py Co-authored-by: Seung Jin <seungjin219@gmail.com> --------- Co-authored-by: Seung Jin <seungjin219@gmail.com>
1 parent a1b15de commit 5fb3933

File tree

3 files changed

+152
-71
lines changed

3 files changed

+152
-71
lines changed

sky/cli.py

Lines changed: 87 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
listed in "sky --help". Take care to put logically connected commands close to
2424
each other.
2525
"""
26+
import collections
2627
import copy
2728
import datetime
2829
import functools
@@ -3413,7 +3414,7 @@ def _list_to_str(lst):
34133414

34143415
# TODO(zhwu,romilb): We should move most of these kubernetes related
34153416
# queries into the backend, especially behind the server.
3416-
def _get_kubernetes_realtime_gpu_table(
3417+
def _get_kubernetes_realtime_gpu_tables(
34173418
context: Optional[str] = None,
34183419
name_filter: Optional[str] = None,
34193420
quantity_filter: Optional[int] = None):
@@ -3423,42 +3424,67 @@ def _get_kubernetes_realtime_gpu_table(
34233424
else:
34243425
qty_header = 'REQUESTABLE_QTY_PER_NODE'
34253426
free_header = 'TOTAL_FREE_GPUS'
3426-
realtime_gpu_table = log_utils.create_table(
3427-
['GPU', qty_header, 'TOTAL_GPUS', free_header])
3428-
realtime_gpu_availability_list = sdk.stream_and_get(
3427+
3428+
realtime_gpu_availability_lists = sdk.stream_and_get(
34293429
sdk.realtime_kubernetes_gpu_availability(
34303430
context=context,
34313431
name_filter=name_filter,
34323432
quantity_filter=quantity_filter))
3433-
if not realtime_gpu_availability_list:
3434-
err_msg = 'No GPUs found in Kubernetes cluster. '
3433+
if not realtime_gpu_availability_lists:
3434+
err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
34353435
debug_msg = 'To further debug, run: sky check '
34363436
if name_filter is not None:
34373437
gpu_info_msg = f' {name_filter!r}'
34383438
if quantity_filter is not None:
34393439
gpu_info_msg += (' with requested quantity'
34403440
f' {quantity_filter}')
34413441
err_msg = (f'Resources{gpu_info_msg} not found '
3442-
'in Kubernetes cluster. ')
3442+
'in any allowed Kubernetes cluster. ')
34433443
debug_msg = ('To show available accelerators on kubernetes,'
34443444
' run: sky show-gpus --cloud kubernetes ')
34453445
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
34463446
debug_msg)
34473447
raise ValueError(full_err_msg)
34483448
no_permissions_str = '<no permissions>'
3449-
for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
3450-
gpu_availability = models.RealtimeGpuAvailability(
3451-
*realtime_gpu_availability)
3452-
available_qty = (gpu_availability.available
3453-
if gpu_availability.available != -1 else
3454-
no_permissions_str)
3455-
realtime_gpu_table.add_row([
3456-
gpu_availability.gpu,
3457-
_list_to_str(gpu_availability.counts),
3458-
gpu_availability.capacity,
3459-
available_qty,
3460-
])
3461-
return realtime_gpu_table
3449+
realtime_gpu_infos = []
3450+
total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
3451+
lambda: [0, 0])
3452+
3453+
for (ctx, availability_list) in realtime_gpu_availability_lists:
3454+
realtime_gpu_table = log_utils.create_table(
3455+
['GPU', qty_header, 'TOTAL_GPUS', free_header])
3456+
for realtime_gpu_availability in sorted(availability_list):
3457+
gpu_availability = models.RealtimeGpuAvailability(
3458+
*realtime_gpu_availability)
3459+
available_qty = (gpu_availability.available
3460+
if gpu_availability.available != -1 else
3461+
no_permissions_str)
3462+
realtime_gpu_table.add_row([
3463+
gpu_availability.gpu,
3464+
_list_to_str(gpu_availability.counts),
3465+
gpu_availability.capacity,
3466+
available_qty,
3467+
])
3468+
gpu = gpu_availability.gpu
3469+
capacity = gpu_availability.capacity
3470+
# we want total, so skip permission denied.
3471+
available = max(gpu_availability.available, 0)
3472+
if capacity > 0:
3473+
total_gpu_info[gpu][0] += capacity
3474+
total_gpu_info[gpu][1] += available
3475+
realtime_gpu_infos.append((ctx, realtime_gpu_table))
3476+
3477+
# display an aggregated table for all contexts
3478+
# if there are more than one contexts with GPUs
3479+
if len(realtime_gpu_infos) > 1:
3480+
total_realtime_gpu_table = log_utils.create_table(
3481+
['GPU', 'TOTAL_GPUS', free_header])
3482+
for gpu, stats in total_gpu_info.items():
3483+
total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
3484+
else:
3485+
total_realtime_gpu_table = None
3486+
3487+
return realtime_gpu_infos, total_realtime_gpu_table
34623488

34633489
def _format_kubernetes_node_info(context: Optional[str]):
34643490
node_table = log_utils.create_table(
@@ -3479,7 +3505,7 @@ def _format_kubernetes_node_info(context: Optional[str]):
34793505
'Kubernetes per node accelerator availability ')
34803506
if nodes_info.hint:
34813507
k8s_per_node_acc_message += nodes_info.hint
3482-
return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3508+
return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
34833509
f'{k8s_per_node_acc_message}'
34843510
f'{colorama.Style.RESET_ALL}\n'
34853511
f'{node_table.get_string()}')
@@ -3516,22 +3542,32 @@ def _output() -> Generator[str, None, None]:
35163542
# If --cloud kubernetes is not specified, we want to catch
35173543
# the case where no GPUs are available on the cluster and
35183544
# print the warning at the end.
3519-
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3520-
context)
3545+
k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context) # pylint: disable=line-too-long
35213546
except ValueError as e:
35223547
if not cloud_is_kubernetes:
35233548
# Make it a note if cloud is not kubernetes
35243549
k8s_messages += 'Note: '
35253550
k8s_messages += str(e)
35263551
else:
35273552
print_section_titles = True
3528-
context_str = f'(Context: {context})' if context else ''
3529-
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3530-
f'Kubernetes GPUs {context_str}'
3531-
f'{colorama.Style.RESET_ALL}\n')
3532-
yield from k8s_realtime_table.get_string()
3533-
yield '\n\n'
3534-
yield _format_kubernetes_node_info(context)
3553+
3554+
# print total table
3555+
if total_table is not None:
3556+
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
3557+
'Total Kubernetes GPUs'
3558+
f'{colorama.Style.RESET_ALL}\n')
3559+
yield from total_table.get_string()
3560+
yield '\n-----\n\n'
3561+
3562+
# print individual infos.
3563+
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
3564+
context_str = f'(Context: {ctx})' if ctx else ''
3565+
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3566+
f'Kubernetes GPUs {context_str}'
3567+
f'{colorama.Style.RESET_ALL}\n')
3568+
yield from k8s_realtime_table.get_string()
3569+
yield '\n\n'
3570+
yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
35353571
if kubernetes_autoscaling:
35363572
k8s_messages += (
35373573
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ def _output() -> Generator[str, None, None]:
36203656
# Print section title if not showing all and instead a specific
36213657
# accelerator is requested
36223658
print_section_titles = True
3623-
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3624-
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
36253659
# TODO(romilb): Show filtered per node GPU availability here as well
36263660
try:
3627-
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3628-
name_filter=name, quantity_filter=quantity)
3629-
yield from k8s_realtime_table.get_string()
3661+
k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables( # pylint: disable=line-too-long
3662+
context=region,
3663+
name_filter=name,
3664+
quantity_filter=quantity)
3665+
3666+
# print total table
3667+
if total_table is not None:
3668+
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
3669+
'Total Kubernetes GPUs'
3670+
f'{colorama.Style.RESET_ALL}\n')
3671+
yield from total_table.get_string()
3672+
yield '\n-----\n\n'
3673+
3674+
# print individual tables
3675+
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
3676+
context_str = f'(Context: {ctx})' if ctx else ''
3677+
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3678+
f'Kubernetes GPUs {context_str}'
3679+
f'{colorama.Style.RESET_ALL}\n')
3680+
yield from k8s_realtime_table.get_string()
3681+
yield '\n\n'
36303682
except ValueError as e:
36313683
# In the case of a specific accelerator, show the error message
36323684
# immediately (e.g., "Resources H100 not found ...")

sky/clouds/service_catalog/kubernetes_catalog.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -261,16 +261,16 @@ def _list_accelerators(
261261

262262
accelerators_available = accelerator_count - allocated_qty
263263

264-
# Initialize the entry if it doesn't exist yet
265-
if accelerator_name not in total_accelerators_available:
266-
total_accelerators_available[accelerator_name] = 0
267-
268264
if accelerators_available >= min_quantity_filter:
269265
quantized_availability = min_quantity_filter * (
270266
accelerators_available // min_quantity_filter)
271-
total_accelerators_available[accelerator_name] = (
272-
total_accelerators_available.get(accelerator_name, 0) +
273-
quantized_availability)
267+
if quantized_availability > 0:
268+
# only increment when quantized availability is positive
269+
# to avoid assertion errors checking keyset sizes in
270+
# core.py _realtime_kubernetes_gpu_availability_single
271+
total_accelerators_available[accelerator_name] = (
272+
total_accelerators_available.get(
273+
accelerator_name, 0) + quantized_availability)
274274

275275
result = []
276276

sky/core.py

Lines changed: 58 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,47 +1012,76 @@ def realtime_kubernetes_gpu_availability(
10121012
context: Optional[str] = None,
10131013
name_filter: Optional[str] = None,
10141014
quantity_filter: Optional[int] = None
1015-
) -> List[models.RealtimeGpuAvailability]:
1016-
1017-
counts, capacity, available = service_catalog.list_accelerator_realtime(
1018-
gpus_only=True,
1019-
clouds='kubernetes',
1020-
name_filter=name_filter,
1021-
region_filter=context,
1022-
quantity_filter=quantity_filter,
1023-
case_sensitive=False)
1024-
assert (set(counts.keys()) == set(capacity.keys()) == set(
1025-
available.keys())), (f'Keys of counts ({list(counts.keys())}), '
1026-
f'capacity ({list(capacity.keys())}), '
1027-
f'and available ({list(available.keys())}) '
1028-
'must be same.')
1029-
if len(counts) == 0:
1030-
err_msg = 'No GPUs found in Kubernetes cluster. '
1015+
) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
1016+
1017+
if context is None:
1018+
context_list = clouds.Kubernetes.existing_allowed_contexts()
1019+
else:
1020+
context_list = [context]
1021+
1022+
def _realtime_kubernetes_gpu_availability_single(
1023+
context: Optional[str] = None,
1024+
name_filter: Optional[str] = None,
1025+
quantity_filter: Optional[int] = None
1026+
) -> List[models.RealtimeGpuAvailability]:
1027+
counts, capacity, available = service_catalog.list_accelerator_realtime(
1028+
gpus_only=True,
1029+
clouds='kubernetes',
1030+
name_filter=name_filter,
1031+
region_filter=context,
1032+
quantity_filter=quantity_filter,
1033+
case_sensitive=False)
1034+
assert (set(counts.keys()) == set(capacity.keys()) == set(
1035+
available.keys())), (f'Keys of counts ({list(counts.keys())}), '
1036+
f'capacity ({list(capacity.keys())}), '
1037+
f'and available ({list(available.keys())}) '
1038+
'must be the same.')
1039+
realtime_gpu_availability_list: List[
1040+
models.RealtimeGpuAvailability] = []
1041+
1042+
for gpu, _ in sorted(counts.items()):
1043+
realtime_gpu_availability_list.append(
1044+
models.RealtimeGpuAvailability(
1045+
gpu,
1046+
counts.pop(gpu),
1047+
capacity[gpu],
1048+
available[gpu],
1049+
))
1050+
return realtime_gpu_availability_list
1051+
1052+
availability_lists: List[Tuple[str,
1053+
List[models.RealtimeGpuAvailability]]] = []
1054+
cumulative_count = 0
1055+
parallel_queried = subprocess_utils.run_in_parallel(
1056+
lambda ctx: _realtime_kubernetes_gpu_availability_single(
1057+
context=ctx,
1058+
name_filter=name_filter,
1059+
quantity_filter=quantity_filter), context_list)
1060+
1061+
for ctx, queried in zip(context_list, parallel_queried):
1062+
cumulative_count += len(queried)
1063+
if len(queried) == 0:
1064+
# don't add gpu results for clusters that don't have any
1065+
logger.debug(f'No gpus found in k8s cluster {ctx}')
1066+
continue
1067+
availability_lists.append((ctx, queried))
1068+
1069+
if cumulative_count == 0:
1070+
err_msg = 'No GPUs found in any Kubernetes clusters. '
10311071
debug_msg = 'To further debug, run: sky check '
10321072
if name_filter is not None:
10331073
gpu_info_msg = f' {name_filter!r}'
10341074
if quantity_filter is not None:
10351075
gpu_info_msg += (' with requested quantity'
10361076
f' {quantity_filter}')
10371077
err_msg = (f'Resources{gpu_info_msg} not found '
1038-
'in Kubernetes cluster. ')
1078+
'in Kubernetes clusters. ')
10391079
debug_msg = ('To show available accelerators on kubernetes,'
10401080
' run: sky show-gpus --cloud kubernetes ')
10411081
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
10421082
debug_msg)
10431083
raise ValueError(full_err_msg)
1044-
1045-
realtime_gpu_availability_list: List[models.RealtimeGpuAvailability] = []
1046-
1047-
for gpu, _ in sorted(counts.items()):
1048-
realtime_gpu_availability_list.append(
1049-
models.RealtimeGpuAvailability(
1050-
gpu,
1051-
counts.pop(gpu),
1052-
capacity[gpu],
1053-
available[gpu],
1054-
))
1055-
return realtime_gpu_availability_list
1084+
return availability_lists
10561085

10571086

10581087
# =================

0 commit comments

Comments (0)