-
Notifications
You must be signed in to change notification settings - Fork 52
[WIP][AQUA] GPU Shape Recommendation #1221
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
18a92c4
bd026e7
4461af7
26e08a2
7ce57c8
a17b035
e94b6f1
fbfdb91
ba605ee
300aa17
96f5543
3a431fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,16 @@ def get(self, id: Union[str, List[str]] = None): | |
return self.get_deployment_config( | ||
model_id=id.split(",") if "," in id else id | ||
) | ||
elif paths.startswith("aqua/deployments/recommend_shapes"): | ||
id = id or self.get_argument("model_id", default=None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this? Looks like in case of the |
||
if not id or not isinstance(id, str): | ||
raise HTTPError( | ||
400, | ||
f"Invalid request format for {self.request.path}. " | ||
"Expected a single model OCID", | ||
) | ||
id = id.replace(" ", "") | ||
return self.get_recommend_shape(model_id=id) | ||
elif paths.startswith("aqua/deployments/shapes"): | ||
return self.list_shapes() | ||
elif paths.startswith("aqua/deployments"): | ||
|
@@ -161,6 +171,36 @@ def get_deployment_config(self, model_id: Union[str, List[str]]): | |
|
||
return self.finish(deployment_config) | ||
|
||
def get_recommend_shape(self, model_id: str):
    """
    Retrieves the shape recommendation report for one Aqua Model.

    Parameters
    ----------
    model_id : str
        A single model ID (str).

    Returns
    -------
    None
        The function sends the recommendation report produced by
        `AquaDeploymentApp.recommend_shape` (requested with generate_table = False).
    """
    app = AquaDeploymentApp()

    compartment_id = self.get_argument("compartment_id", default=COMPARTMENT_OCID)

    # The rich table is the CLI rendering of the report. An HTTP handler must
    # always answer with a JSON-serializable payload regardless of the output
    # format, so the table variant is never requested here.
    recommend_report = app.recommend_shape(
        model_id=model_id,
        compartment_id=compartment_id,
        generate_table=False,
    )

    return self.finish(recommend_report)
|
||
def list_shapes(self): | ||
""" | ||
Lists the valid model deployment shapes. | ||
|
@@ -408,6 +448,7 @@ def get(self, model_deployment_id): | |
("deployments/?([^/]*)/params", AquaDeploymentParamsHandler), | ||
("deployments/config/?([^/]*)", AquaDeploymentHandler), | ||
("deployments/shapes/?([^/]*)", AquaDeploymentHandler), | ||
("deployments/recommend_shapes/?([^/]*)", AquaDeploymentHandler), | ||
("deployments/?([^/]*)", AquaDeploymentHandler), | ||
("deployments/?([^/]*)/activate", AquaDeploymentHandler), | ||
("deployments/?([^/]*)/deactivate", AquaDeploymentHandler), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,3 +11,5 @@ | |
|
||
DEFAULT_WAIT_TIME = 12000 | ||
DEFAULT_POLL_INTERVAL = 10 | ||
|
||
SHAPE_MAP = {"NVIDIA_GPU": "GPU"} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just in case, here is the full list of supported series.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I did research this; the issue is that I'm not 100% sure whether these map to GPU/CPU types. For now, I will map all of them to CPU types except for NVIDIA_GPU, since the AMD GPU shape (MX300) did not have AMD_ROME as its shape_series parameter. I also did not see any of these series (except for NVIDIA_GPU) when we queried for GPU-only shapes. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,11 +8,12 @@ | |
import shlex | ||
import threading | ||
from datetime import datetime, timedelta | ||
from typing import Dict, List, Optional | ||
from typing import Dict, List, Optional, Union | ||
|
||
from cachetools import TTLCache, cached | ||
from oci.data_science.models import ModelDeploymentShapeSummary | ||
from pydantic import ValidationError | ||
from rich.table import Table | ||
|
||
from ads.aqua.app import AquaApp, logger | ||
from ads.aqua.common.entities import ( | ||
|
@@ -63,14 +64,20 @@ | |
ModelDeploymentConfigSummary, | ||
MultiModelDeploymentConfigLoader, | ||
) | ||
from ads.aqua.modeldeployment.constants import DEFAULT_POLL_INTERVAL, DEFAULT_WAIT_TIME | ||
from ads.aqua.modeldeployment.constants import ( | ||
DEFAULT_POLL_INTERVAL, | ||
DEFAULT_WAIT_TIME, | ||
SHAPE_MAP, | ||
) | ||
from ads.aqua.modeldeployment.entities import ( | ||
AquaDeployment, | ||
AquaDeploymentDetail, | ||
ConfigValidationError, | ||
CreateModelDeploymentDetails, | ||
) | ||
from ads.aqua.modeldeployment.model_group_config import ModelGroupConfig | ||
from ads.aqua.shaperecommend.recommend import AquaShapeRecommend | ||
from ads.aqua.shaperecommend.shape_report import ShapeRecommendationReport | ||
from ads.common.object_storage_details import ObjectStorageDetails | ||
from ads.common.utils import UNKNOWN, get_log_links | ||
from ads.common.work_request import DataScienceWorkRequest | ||
|
@@ -1243,6 +1250,101 @@ def validate_deployment_params( | |
) | ||
return {"valid": True} | ||
|
||
def valid_compute_shapes(self, **kwargs) -> List["ComputeShapeSummary"]:
    """
    Returns the GPU shapes from the GPU shapes index as ComputeShapeSummary objects,
    marking each one as available or unavailable in the given compartment.

    Parameters
    ----------
    **kwargs :
        compartment_id : str, optional
            Compartment used to list the model deployment shapes.
            Falls back to COMPARTMENT_OCID when missing or empty.
        Any remaining keyword arguments are forwarded to the
        `list_model_deployment_shapes` listing call.

    Returns
    -------
    List[ComputeShapeSummary]
        One ComputeShapeSummary per shape in the GPU shapes index, sorted by
        GPU memory in descending order.
    """
    # An explicit compartment_id=None must not override the default compartment.
    compartment_id = kwargs.pop("compartment_id", None) or COMPARTMENT_OCID
    oci_shapes: List[ModelDeploymentShapeSummary] = self.list_resource(
        self.ds_client.list_model_deployment_shapes,
        compartment_id=compartment_id,
        **kwargs,
    )
    user_shapes_by_name = {shape.name: shape for shape in oci_shapes}

    gpu_shapes_metadata = load_gpu_shapes_index().shapes

    valid_shapes = []
    # Only GPU shapes are considered for now. Recommendations are based on which
    # shapes are possible, not only on which shapes are currently available, so
    # including CPU shapes later will need a CPU shape index similar to
    # load_gpu_shapes_index().
    for name, spec in gpu_shapes_metadata.items():
        oci_shape = user_shapes_by_name.get(name)

        if oci_shape is not None:
            # Shape is listed for the compartment: take its sizing details from
            # OCI and the GPU details from the index.
            compute_shape = ComputeShapeSummary(
                available=True,
                core_count=oci_shape.core_count,
                memory_in_gbs=oci_shape.memory_in_gbs,
                shape_series=SHAPE_MAP.get(oci_shape.shape_series, "GPU"),
                name=oci_shape.name,
                gpu_specs=spec,
            )
        else:
            # Shape is known to the index but not listed for the compartment.
            compute_shape = ComputeShapeSummary(
                available=False, name=name, shape_series="GPU", gpu_specs=spec
            )
        valid_shapes.append(compute_shape)

    valid_shapes.sort(
        key=lambda shape: shape.gpu_specs.gpu_memory_in_gbs, reverse=True
    )
    return valid_shapes
|
||
def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
    """
    Builds the GPU shape recommendation for a model.

    For the CLI (generate_table = True), the result is a rich table with the valid
    GPU deployment shapes for the provided model and configuration.

    For the API (generate_table = False), the result is the recommendation report
    with the valid GPU deployment shapes for the provided model and configuration.

    Parameters
    ----------
    **kwargs :
        Keyword arguments forwarded to `AquaShapeRecommend.which_shapes`, with a
        `shapes` entry added from `valid_compute_shapes`. The request handler
        passes:

        model_id : str
            OCID of the model to recommend feasible compute shapes for.
        compartment_id : str, optional
            Compartment used to look up the shapes. Defaults to COMPARTMENT_OCID.
        generate_table : bool
            Selects the table (True) or the report (False) output.

    Returns
    -------
    Table (generate_table = True)
        A table format for the recommendation report with compatible deployment shapes
        or troubleshooting info citing the largest shapes if no shape is suitable.

    ShapeRecommendationReport (generate_table = False)
        A recommendation report with compatible deployment shapes, or troubleshooting info
        citing the largest shapes if no shape is suitable.

    Raises
    ------
    AquaValueError
        If model type is unsupported by tool (no recommendation report generated).
        NOTE(review): presumably raised inside `which_shapes`; nothing in this
        method raises it directly.
    """
    # The candidate shapes are resolved for the caller's compartment and passed
    # along with the untouched caller arguments.
    kwargs["shapes"] = self.valid_compute_shapes(
        compartment_id=kwargs.get("compartment_id", COMPARTMENT_OCID)
    )

    return AquaShapeRecommend().which_shapes(**kwargs)
|
||
@telemetry(entry_point="plugin=deployment&action=list_shapes", name="aqua") | ||
@cached(cache=TTLCache(maxsize=1, ttl=timedelta(minutes=5), timer=datetime.now)) | ||
def list_shapes(self, **kwargs) -> List[ComputeShapeSummary]: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NIT:
/recommended_shapes
would be better.