From 1a5610c17ecba5cd159b191a5772ae306481fd55 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 13 Oct 2025 14:45:15 -0700 Subject: [PATCH 01/33] rename .infra to infra and clean up imports --- {.infra => infra}/__init__.py | 0 {.infra => infra}/test_infra/__init__.py | 0 {.infra => infra}/test_infra/ec2/__init__.py | 0 {.infra => infra}/test_infra/ec2/setup.py | 6 +----- {.infra => infra}/test_infra/eks/__init__.py | 0 .../test_infra/eks/multinode_heavy/__init__.py | 0 .../test_infra/eks/multinode_heavy/eks-cluster.yaml | 0 .../test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml | 0 .../test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml | 0 .../eks/multinode_heavy/fsx-storage-class.yaml | 0 .../eks/multinode_heavy/large-model-nodegroup.yaml | 0 {.infra => infra}/test_infra/eks/setup.py | 6 +----- {.infra => infra}/test_infra/entrypoint.py | 9 +++------ {.infra => infra}/test_infra/test_infra_utils.py | 5 +---- {.infra => infra}/test_infra/validators/__init__.py | 0 .../test_infra/validators/base_platform_validator.py | 0 .../test_infra/validators/platform_configs.py | 0 .../test_infra/validators/platform_validator_utils.py | 4 ++-- .../test_infra/validators/platform_validators.py | 4 ++-- test/testrunner.py | 10 +--------- 20 files changed, 11 insertions(+), 33 deletions(-) rename {.infra => infra}/__init__.py (100%) rename {.infra => infra}/test_infra/__init__.py (100%) rename {.infra => infra}/test_infra/ec2/__init__.py (100%) rename {.infra => infra}/test_infra/ec2/setup.py (93%) rename {.infra => infra}/test_infra/eks/__init__.py (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/__init__.py (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/eks-cluster.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/fsx-storage-class.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml (100%) rename {.infra => infra}/test_infra/eks/setup.py (91%) rename {.infra => infra}/test_infra/entrypoint.py (93%) rename {.infra => infra}/test_infra/test_infra_utils.py (96%) rename {.infra => infra}/test_infra/validators/__init__.py (100%) rename {.infra => infra}/test_infra/validators/base_platform_validator.py (100%) rename {.infra => infra}/test_infra/validators/platform_configs.py (100%) rename {.infra => infra}/test_infra/validators/platform_validator_utils.py (74%) rename {.infra => infra}/test_infra/validators/platform_validators.py (87%) diff --git a/.infra/__init__.py b/infra/__init__.py similarity index 100% rename from .infra/__init__.py rename to infra/__init__.py diff --git a/.infra/test_infra/__init__.py b/infra/test_infra/__init__.py similarity index 100% rename from .infra/test_infra/__init__.py rename to infra/test_infra/__init__.py diff --git a/.infra/test_infra/ec2/__init__.py b/infra/test_infra/ec2/__init__.py similarity index 100% rename from .infra/test_infra/ec2/__init__.py rename to infra/test_infra/ec2/__init__.py diff --git a/.infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py similarity index 93% rename from .infra/test_infra/ec2/setup.py rename to infra/test_infra/ec2/setup.py index a3761b2f514b..ea4a0084a54c 100644 --- a/.infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -2,11 +2,7 @@ import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path - -current_dir = 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(current_dir) - -from test_infra_utils import create_logger +from infra.test_infra.test_infra_utils import create_logger LOGGER = create_logger(__name__) diff --git a/.infra/test_infra/eks/__init__.py b/infra/test_infra/eks/__init__.py similarity index 100% rename from .infra/test_infra/eks/__init__.py rename to infra/test_infra/eks/__init__.py diff --git a/.infra/test_infra/eks/multinode_heavy/__init__.py b/infra/test_infra/eks/multinode_heavy/__init__.py similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/__init__.py rename to infra/test_infra/eks/multinode_heavy/__init__.py diff --git a/.infra/test_infra/eks/multinode_heavy/eks-cluster.yaml b/infra/test_infra/eks/multinode_heavy/eks-cluster.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/eks-cluster.yaml rename to infra/test_infra/eks/multinode_heavy/eks-cluster.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml b/infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml rename to infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml b/infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml rename to infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml b/infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml rename to infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml b/infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml rename to infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml diff --git a/.infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py similarity index 91% rename from .infra/test_infra/eks/setup.py rename to infra/test_infra/eks/setup.py index a6478f7c8b8c..9e287e77ddde 100644 --- a/.infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -2,11 +2,7 @@ import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path - -current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(current_dir) - -from test_infra_utils import create_logger +from infra.test_infra.test_infra_utils import create_logger LOGGER = create_logger(__name__) diff --git a/.infra/test_infra/entrypoint.py b/infra/test_infra/entrypoint.py similarity index 93% rename from .infra/test_infra/entrypoint.py rename to infra/test_infra/entrypoint.py index 48974ec01b4b..7fe859bc087c 100644 --- a/.infra/test_infra/entrypoint.py +++ b/infra/test_infra/entrypoint.py @@ -4,13 +4,10 @@ from test.test_utils import get_dlc_images from codebuild_environment import get_cloned_folder_path -current_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(current_dir) - -from ec2.setup import EC2Platform -from eks.setup import EKSPlatform -from test_infra_utils import ( +from infra.test_infra.ec2.setup import EC2Platform +from infra.test_infra.eks.setup import EKSPlatform +from 
infra.test_infra.test_infra_utils import ( create_logger, parse_buildspec, validate_and_filter_tests, diff --git a/.infra/test_infra/test_infra_utils.py b/infra/test_infra/test_infra_utils.py similarity index 96% rename from .infra/test_infra/test_infra_utils.py rename to infra/test_infra/test_infra_utils.py index 9cf6f50d6c76..a98707678797 100644 --- a/.infra/test_infra/test_infra_utils.py +++ b/infra/test_infra/test_infra_utils.py @@ -7,10 +7,7 @@ from src.buildspec import Buildspec from test.test_utils import get_buildspec_path from codebuild_environment import get_cloned_folder_path - -current_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(current_dir) -from validators.platform_validator_utils import get_platform_validator +from infra.test_infra.validators.platform_validator_utils import get_platform_validator def create_logger(name, level=logging.INFO): diff --git a/.infra/test_infra/validators/__init__.py b/infra/test_infra/validators/__init__.py similarity index 100% rename from .infra/test_infra/validators/__init__.py rename to infra/test_infra/validators/__init__.py diff --git a/.infra/test_infra/validators/base_platform_validator.py b/infra/test_infra/validators/base_platform_validator.py similarity index 100% rename from .infra/test_infra/validators/base_platform_validator.py rename to infra/test_infra/validators/base_platform_validator.py diff --git a/.infra/test_infra/validators/platform_configs.py b/infra/test_infra/validators/platform_configs.py similarity index 100% rename from .infra/test_infra/validators/platform_configs.py rename to infra/test_infra/validators/platform_configs.py diff --git a/.infra/test_infra/validators/platform_validator_utils.py b/infra/test_infra/validators/platform_validator_utils.py similarity index 74% rename from .infra/test_infra/validators/platform_validator_utils.py rename to infra/test_infra/validators/platform_validator_utils.py index 092544ed72ea..d1513a008865 100644 --- a/.infra/test_infra/validators/platform_validator_utils.py +++ b/infra/test_infra/validators/platform_validator_utils.py @@ -1,5 +1,5 @@ -from .base_platform_validator import BasePlatformValidator -from .platform_validators import EC2MultiNodeValidator, EKSValidator +from infra.test_infra.validators.base_platform_validator import BasePlatformValidator +from infra.test_infra.validators.platform_validators import EC2MultiNodeValidator, EKSValidator _VALIDATORS = {"ec2-multi-node": EC2MultiNodeValidator, "eks": EKSValidator} diff --git a/.infra/test_infra/validators/platform_validators.py b/infra/test_infra/validators/platform_validators.py similarity index 87% rename from .infra/test_infra/validators/platform_validators.py rename to infra/test_infra/validators/platform_validators.py index 7f3febddab18..a1a9f1a41c6b 100644 --- a/.infra/test_infra/validators/platform_validators.py +++ b/infra/test_infra/validators/platform_validators.py @@ -1,6 +1,6 @@ from typing import List, Dict -from .base_platform_validator import BasePlatformValidator -from .platform_configs import EC2Config, EKSConfig +from infra.test_infra.validators.base_platform_validator import BasePlatformValidator +from infra.test_infra.validators.platform_configs import EC2Config, EKSConfig class EC2MultiNodeValidator(BasePlatformValidator): diff --git a/test/testrunner.py b/test/testrunner.py index 796231486375..9ff18fa19539 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -3,7 +3,6 @@ import sys import logging import re -import importlib.util from multiprocessing import Pool, 
Manager from datetime import datetime @@ -34,6 +33,7 @@ from test_utils import KEYS_TO_DESTROY_FILE from test_utils.pytest_cache import PytestCache from test.vllm.trigger_test import test as test_vllm +from infra.test_infra.entrypoint import main as run_new_tests from src.codebuild_environment import get_codebuild_project_name @@ -438,14 +438,6 @@ def main(): try: LOGGER.info(f"Running vLLM EKS EC2 tests with image: {all_image_list[0]}") if new_test_structure_enabled: - project_root = os.path.dirname(os.path.dirname(os.getcwd())) - spec = importlib.util.spec_from_file_location( - "entrypoint", - os.path.join(project_root, ".infra", "test_infra", "entrypoint.py"), - ) - entrypoint_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(entrypoint_module) - run_new_tests = entrypoint_module.main LOGGER.info("Using new buildspec-based test system") run_new_tests() else: From 815527322cec7018a2d1942a53b67116aea30d6a Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:23:25 -0700 Subject: [PATCH 02/33] copy and modify ec2 infra utils --- infra/test_infra/ec2/utils.py | 2183 +++++++++++++++++++++++++++++++++ 1 file changed, 2183 insertions(+) create mode 100644 infra/test_infra/ec2/utils.py diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py new file mode 100644 index 000000000000..86e3d0b8a3a8 --- /dev/null +++ b/infra/test_infra/ec2/utils.py @@ -0,0 +1,2183 @@ +import os +import time +import re +import logging +import sys +import uuid +import copy + +from random import randint + +from collections import Counter + +from inspect import signature + +import boto3 + +from fabric import Connection +from botocore.config import Config +from botocore.exceptions import ClientError +from invoke import run +from packaging.version import Version +from packaging.specifiers import SpecifierSet +from tenacity import ( + retry, + stop_after_attempt, + stop_after_delay, + wait_fixed, + wait_random_exponential, +) + +from test.test_utils import ( + get_synapseai_version_from_tag, + is_deep_canary_context, + is_pr_context, + is_mainline_context, + are_heavy_instance_ec2_tests_enabled, + login_to_ecr_registry, + get_account_id_from_image_uri, + UL_AMI_LIST, +) +from . import DEFAULT_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET +from infra.test_infra.test_infra_utils import create_logger + +EC2_INSTANCE_ROLE_NAME = "ec2TestInstanceRole" + +# List of instance types for which, if instance spin-up fails, the test is skipped instead of failing. 
+ICE_SKIP_INSTANCE_LIST = [] + +# List of instance types which are too powerful for minor tests +HEAVY_INSTANCE_LIST = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + +# Flag to enable IPv6 testing +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + +IPV6_VPC_NAME = os.getenv("IPV6_VPC_NAME") + +LOGGER = create_logger(__name__) + +def filter_only_multi_gpu(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_instance_num_gpus(instance_type=instance_type) > 1 + ] + return filtered_list + + +def filter_only_multi_gpu_and_no_g_type(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_instance_num_gpus(instance_type=instance_type) > 1 + and not instance_type.startswith("g") + ] + return filtered_list + + +def filter_only_single_gpu(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_instance_num_gpus(instance_type=instance_type) == 1 + ] + return filtered_list + + +def filter_no_t32x(instance_type_list): + filtered_list = [ + instance_type for instance_type in instance_type_list if instance_type != "t3.2xlarge" + ] + return filtered_list + + +def is_instance_single_gpu(instance_type): + return get_instance_num_gpus(instance_type=instance_type) == 1 + + +def is_instance_multi_gpu(instance_type): + return get_instance_num_gpus(instance_type=instance_type) > 1 + + +def filter_not_heavy_instance_types(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if instance_type not in HEAVY_INSTANCE_LIST + ] + return filtered_list + + +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html +# both g4dn and g5.24xlarge we use in RC is not RDMA read supported +# performance test will fail if we use g5.24xlarge +def filter_efa_instance_type(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_num_efa_interfaces_for_instance_type(instance_type) + and not instance_type.startswith("g4") + and not instance_type.startswith("g5") + ] + return filtered_list + + +def filter_efa_only_p4_instance_type(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_num_efa_interfaces_for_instance_type(instance_type) + and instance_type.startswith("p4") + ] + return filtered_list + + +def get_cicd_instance_reserved_region(instance_type): + return P4DE_REGION if instance_type in ["p4de.24xlarge"] else DEFAULT_REGION + + +def get_efa_ec2_instance_type(default, filter_function=lambda x: x, job_type=""): + """ + Helper function wrapping around get_ec2_instance_type to parametrize both ec2_instance_type + as well as region in cases where certain instance types are reserved in a particular region. + :param default: Default instance type to use + :param filter_function: filter_function(instance_type_list) A function that takes the list to be generated by + the logic of the get_ec2_instance_type function, and filters the list to only produce "acceptable" instances. + For example, this can be a function that only returns multi-gpu instance types from a given list of instance types. + :param job_type: str "training"/"inference"/"" as required by the instance-type being tested + :return: one item list of instance type -- this is used to parametrize tests, and parameter is required to be + a list. 
+ """ + instance_list = get_ec2_instance_type(default, "gpu", filter_function, job_type=job_type) + instance_region_list = [ + (instance_type, get_cicd_instance_reserved_region(instance_type)) + for instance_type in instance_list + ] + return instance_region_list + + +def get_ec2_instance_type( + default, processor, filter_function=lambda x: x, arch_type="", job_type="" +): + """ + Get EC2 instance type from associated EC2_[CPU|GPU]_INSTANCE_TYPE env variable, or set it to a + default for contexts where the variable is not present (i.e. PR, Nightly, local testing) + + :param default: Default instance type to use + :param processor: "cpu" or "gpu" + :param filter_function: filter_function(instance_type_list) A function that takes the list to be + generated by the logic of the get_ec2_instance_type function, and filters the list to only + produce "acceptable" instances. For example, this can be a function that only returns multi-gpu + instance types from a given list of instance types. + + :return: one item list of instance type -- this is used to parametrize tests, and parameter is + required to be a list. + """ + if is_pr_context() or is_deep_canary_context(): + # This condition filters out instance types that use resources with low-availability, or + # use very expensive instance types. + if not are_heavy_instance_ec2_tests_enabled() and default in HEAVY_INSTANCE_LIST: + return [] + return [default] + + allowed_processors = ("cpu", "gpu", "neuronx", "neuron", "hpu") + job_type_str = f"_{job_type.upper()}" if job_type else "" + if processor not in allowed_processors: + raise RuntimeError( + f"Aborting EC2 test run. Unrecognized processor type {processor}. " + f"Please choose from {allowed_processors}" + ) + instance_type = os.getenv(f"EC2_{processor.upper()}{job_type_str}_INSTANCE_TYPE") + if arch_type == "graviton" or arch_type == "arm64": + instance_type = os.getenv( + f"EC2_{processor.upper()}_{arch_type.upper()}{job_type_str}_INSTANCE_TYPE" + ) + if not instance_type: + return [] + + instance_list = filter_function([instance_type] if instance_type else []) + return instance_list + + +def get_ec2_accelerator_type(default, processor): + """ + Get EC2 instance type from associated EC2_EIA_INSTANCE_TYPE env variable, or set it to a default + for contexts where the variable is not present (i.e. PR, Nightly, local testing) + + :param default: Default accelerator instance type to use + :param processor: "eia" + + :return: one item list of instance type -- this is used to parametrize tests, and parameter is required to be + a list. + """ + allowed_processors = ("eia",) + if processor not in allowed_processors: + raise RuntimeError( + f"Aborting EC2 test run. Unrecognized processor type {processor}. 
" + f"Please choose from {allowed_processors}" + ) + accelerator_type = os.getenv(f"EC2_{processor.upper()}_INSTANCE_TYPE") + if not accelerator_type: + if is_mainline_context(): + return [] + return [default] + return [accelerator_type] + + +def launch_instance( + ami_id, + instance_type, + ec2_key_name=None, + region=DEFAULT_REGION, + user_data=None, + iam_instance_profile_name=None, + instance_name="", +): + """ + Launch an instance + :param ami_id: AMI ID to be used for launched instance + :param instance_type: Instance type of launched instance + :param region: Region where instance will be launched + :param user_data: Script to run when instance is launched as a str + :param iam_instance_profile_arn: EC2 Role to be attached + :param instance_name: Tag to display as Name on EC2 Console + :return: Information about the instance that was launched + """ + if not ami_id: + raise Exception("No ami_id provided") + if not ec2_key_name: + raise Exception("Ec2 Key name must be provided") + client = boto3.Session(region_name=region).client("ec2") + LOGGER.info(f"Using AMI ID: {ami_id}") + volume_name = "/dev/sda1" if ami_id in UL_AMI_LIST else "/dev/xvda" + + # Construct the dictionary with the arguments for API call + arguments_dict = { + "KeyName": ec2_key_name, + "ImageId": ami_id, + "InstanceType": instance_type, + "MaxCount": 1, + "MinCount": 1, + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [{"Key": "Name", "Value": f"CI-CD {instance_name}"}], + }, + ], + "MetadataOptions": { + "HttpTokens": "required", + "HttpEndpoint": "enabled", + "HttpPutResponseHopLimit": 2, + }, + "BlockDeviceMappings": [ + { + "DeviceName": volume_name, + "Ebs": { + "VolumeSize": 150, + }, + } + ], + } + if user_data: + arguments_dict["UserData"] = user_data + if iam_instance_profile_name: + arguments_dict["IamInstanceProfile"] = {"Name": iam_instance_profile_name} + + reservations = get_available_reservations( + ec2_client=client, instance_type=instance_type, min_availability=arguments_dict["MinCount"] + ) + + while reservations: + reservation = reservations.pop(0) + arguments_dict["CapacityReservationSpecification"] = { + "CapacityReservationTarget": { + "CapacityReservationId": reservation["CapacityReservationId"] + } + } + try: + response = client.run_instances(**arguments_dict) + LOGGER.info( + f"Your {instance_type} reservation is ready, please wait to be seated. Launching..." + ) + if is_mainline_context(): + LOGGER.info(f"Launched instance via {reservation}") + return response["Instances"][0] + except ClientError as e: + LOGGER.error(f"Failed to launch via {instance_type} reservation - {e}") + # Refresh available reservations + time.sleep(randint(10, 30)) + reservations = get_available_reservations( + ec2_client=client, + instance_type=instance_type, + min_availability=arguments_dict["MinCount"], + ) + + # Clean up cap reservation if we don't find one + arguments_dict.pop("CapacityReservationSpecification", None) + LOGGER.info(f"No capacity reservation available for {instance_type}, trying elsewhere...") + response = client.run_instances(**arguments_dict) + + if not response or len(response["Instances"]) < 1: + raise Exception( + "Unable to launch the instance. 
\ + Did not return any response" + ) + + return response["Instances"][0] + + +def get_available_reservations(ec2_client, instance_type, min_availability=1): + """ + Get capacity reservations in our region that have our minimum availability + + Args: + ec2_client (boto3.client): EC2 Boto3 client + instance_type (string): instance type, i.e. g5.8xlarge + min_availability (int, optional): Minimum number of instances to launch. Defaults to 1. + + Returns: + list: list of dictionaries of reservations + """ + reservations = ec2_client.describe_capacity_reservations() + + open_tables = [ + reservation + for reservation in reservations["CapacityReservations"] + if reservation["InstanceType"] == instance_type + and reservation["AvailableInstanceCount"] >= min_availability + ] + + # Sort by ascending instance count and total instance count, + # so that we take minimum instances required, and leave other reservations + # open for larger parties + open_tables.sort(key=lambda res: res["TotalInstanceCount"]) + + return sorted(open_tables, key=lambda res: res["AvailableInstanceCount"]) + + +@retry( + reraise=True, + stop=stop_after_delay(30 * 60), # Keep retrying for 30 minutes + wait=wait_random_exponential(min=60, max=5 * 60), # Retry after waiting 1-5 minutes +) +def launch_instances_with_retry( + ec2_resource, ec2_client, availability_zone_options, ec2_create_instances_definition, fn_name="" +): + """ + Helper function to launch EC2 instances with retry capability, to allow multiple attempts + when facing instance capacity issues. + :param ec2_resource: boto3 EC2 Service Resource object + :param ec2_client: boto3 EC2 Client object + :param availability_zone_options: list of availability zones in which to try to run instances + :param ec2_create_instances_definition: dict of parameters to pass to + ec2_resource.create_instances + :param fn_name: string - function name for ease of logging + :return: list of EC2 Instance Resource objects for instances launched + """ + + instances = None + reservations = get_available_reservations( + ec2_client=ec2_client, + instance_type=ec2_create_instances_definition["InstanceType"], + min_availability=ec2_create_instances_definition["MinCount"], + ) + # Look at available CRs first + while reservations: + reservation = reservations.pop(0) + ec2_create_instances_definition["CapacityReservationSpecification"] = { + "CapacityReservationTarget": { + "CapacityReservationId": reservation["CapacityReservationId"] + } + } + try: + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + LOGGER.info( + f"Your reservation is ready for {fn_name}, please wait to be seated. Launching..." + ) + if is_mainline_context(): + LOGGER.info(f"Launched instance for {fn_name} via {reservation}") + return instances + except ClientError as e: + LOGGER.error(f"Failed to launch via reservation for {fn_name} - {e}") + + # Clean up capacity reservation if it failed + ec2_create_instances_definition.pop("CapacityReservationSpecification", None) + + LOGGER.info( + f"Looks like you didn't have a reservation for {fn_name}, let's see if we can seat you as a walk-in..." 
+ ) + + if availability_zone_options: + error = None + for a_zone in availability_zone_options: + ec2_create_instances_definition["Placement"] = {"AvailabilityZone": a_zone} + try: + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + if instances: + break + except ClientError as e: + LOGGER.error(f"Failed to launch in {a_zone} due to {e} for {fn_name}") + error = e + continue + if not instances: + raise error + else: + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + return instances + + +def launch_efa(ec2_client, ec2_instance_type, ec2_run_instances_definition, availability_zone): + ec2_efa_run_instances_definition = copy.deepcopy(ec2_run_instances_definition) + ec2_efa_run_instances_definition.update( + { + "Placement": {"AvailabilityZone": availability_zone}, + "NetworkInterfaces": generate_network_interfaces( + ec2_client, ec2_instance_type, availability_zone + ), + } + ) + response = ec2_client.run_instances(**ec2_efa_run_instances_definition) or {} + return response.get("Instances") + + +def launch_efa_with_reservations( + ec2_client, ec2_instance_type, reservations, ec2_run_instances_definition, fn_name="" +): + ec2_run_instances_reserved_definition = copy.deepcopy(ec2_run_instances_definition) + while reservations: + reservation = reservations.pop(0) + ec2_run_instances_reserved_definition["CapacityReservationSpecification"] = { + "CapacityReservationTarget": { + "CapacityReservationId": reservation["CapacityReservationId"] + } + } + try: + instances = launch_efa( + ec2_client, + ec2_instance_type, + ec2_run_instances_reserved_definition, + reservation["AvailabilityZone"], + ) + if instances: + LOGGER.info( + f"Your EFA reservation is ready for {fn_name}, please wait to be seated. Launching..." + ) + if is_mainline_context(): + LOGGER.info(f"Launched EFA enabled instance for {fn_name} via {reservation}") + return instances + except ClientError as e: + LOGGER.debug( + f"Failed to launch EFA instance for {fn_name} from reservation due to {e}\n" + "Checking additional open reservations..." + ) + return [] + + +def validate_efa_instance_conditions(instances, minimum_number_of_instances): + if len(instances) == minimum_number_of_instances: + return True + if len(instances) > minimum_number_of_instances: + raise RuntimeError( + f"Launched too many instances somehow, raising and cleaning up - {instances}; min/max_allowed = {minimum_number_of_instances}" + ) + return False + + +class HeterogenousReservationError(Exception): + pass + + +def referesh_capacity_reservations(ec2_client, ec2_instance_type, az): + reservations = [ + reservation + for reservation in get_available_reservations(ec2_client, ec2_instance_type) + if reservation["AvailabilityZone"] == az + ] + + available_instances = sum( + [reservation["AvailableInstanceCount"] for reservation in reservations] + ) + + return reservations, available_instances + + +def launch_efa_with_heterogenous_reservations(ec2_client, ec2_run_instances_definition, fn_name=""): + """ + Launch efa instances with heterogenous reservations + + Previous EFA launch code requires instances to be launched from the same command. This prohibits launching instances + from multiple capacity reservations if the reservation has less than the minimum available instances required (typically 2). + + To remedy this, we group reservations by availability zone. If we have instances available in reservation, we + group by most common availability zone and try to launch multiple instances from reservation. 
If we do not meet our minimum + requirements, try launching from public pool to remedy the situation. If we launch 0 from reservation, do not + try launching from the public pool, and allow other functions to handle launching exclusively from public. + + Args: + ec2_client (boto3.client): boto3 ec2 client + ec2_run_instances_definition (dict): key/value pairs for run instances launch cmd + fn_name (str, optional): pytest function name. Defaults to "". + + Raises: + HeterogenousReservationError: Custom error handling for function failure + + Returns: + list: launched instances + """ + ec2_heterogenous_run_instances_definition = copy.deepcopy(ec2_run_instances_definition) + ec2_instance_type = ec2_heterogenous_run_instances_definition["InstanceType"] + minimum_number_of_instances = ec2_heterogenous_run_instances_definition["MinCount"] + + # Reset max and min count to 1; We will + ec2_heterogenous_run_instances_definition["MaxCount"] = 1 + ec2_heterogenous_run_instances_definition["MinCount"] = 1 + + reserved_azs = [ + reservation["AvailabilityZone"] + for reservation in ec2_client.describe_capacity_reservations()["CapacityReservations"] + if reservation["InstanceType"] == ec2_instance_type + ] + + tmp_reservations = get_available_reservations( + ec2_client=ec2_client, + instance_type=ec2_instance_type, + min_availability=ec2_heterogenous_run_instances_definition["MinCount"], + ) + + az_counter = Counter(reservation["AvailabilityZone"] for reservation in tmp_reservations) + az_priorities = [c[0] for c in az_counter.most_common()] + + # Track all reserved availability zones, in case capacity comes later + for reserved_az in reserved_azs: + if reserved_az not in az_priorities: + az_priorities.append(reserved_az) + + for az in az_priorities: + LOGGER.info(f"Checking AZ {az}") + # Refresh reservations for each AZ + reservations, available_instances = referesh_capacity_reservations( + ec2_client, ec2_instance_type, az + ) + ec2_heterogenous_run_instances_definition["MaxCount"] = 1 + ec2_heterogenous_run_instances_definition["MinCount"] = 1 + instances = [] + try: + while available_instances and len(instances) < minimum_number_of_instances: + LOGGER.info(f"trying to launch {ec2_instance_type} in {az}") + instance = launch_efa_with_reservations( + ec2_client=ec2_client, + ec2_instance_type=ec2_instance_type, + reservations=reservations, + ec2_run_instances_definition=ec2_heterogenous_run_instances_definition, + fn_name=fn_name, + ) + instances += instance + + # Refresh reservations for each AZ + reservations, available_instances = referesh_capacity_reservations( + ec2_client, ec2_instance_type, az + ) + + if validate_efa_instance_conditions(instances, minimum_number_of_instances): + LOGGER.info("Strung together some reservations, let's go") + return instances + + # If we have remaining instances, try launching from public pool + # Try a different availability zone if we don't have any reservation launches, however. Always + # prioritize reservation launches in this function. + remaining_instances = minimum_number_of_instances - len(instances) + if remaining_instances != minimum_number_of_instances: + LOGGER.info( + f"Have {remaining_instances} remaining_instances instances in {az}. Trying from public pool." 
) + ec2_heterogenous_run_instances_definition["MaxCount"] = remaining_instances + ec2_heterogenous_run_instances_definition["MinCount"] = remaining_instances + instances += launch_efa( + ec2_client, ec2_instance_type, ec2_heterogenous_run_instances_definition, az + ) + + if validate_efa_instance_conditions(instances, minimum_number_of_instances): + LOGGER.info("Strung together some reservations and some walk-ins, let's go") + return instances + + # Clean up instances if this workflow did not succeed + LOGGER.info( + f"Failed to launch enough instances from public and reservations for {fn_name}." + ) + if instances: + LOGGER.info( + f"Cleaning up instances {(instance['InstanceId'] for instance in instances)}..." + ) + ec2_client.terminate_instances( + InstanceIds=[instance_info["InstanceId"] for instance_info in instances] + ) + + except ClientError as e: + # Clean up any remaining instances + LOGGER.info( + f"Failed to launch EFA instance for {fn_name} from reservation due to {e}\n" + "Checking additional open reservations and cleaning up stray resources" + ) + if instances: + LOGGER.info( + f"Cleaning up instances {(instance['InstanceId'] for instance in instances)}..." + ) + ec2_client.terminate_instances( + InstanceIds=[instance_info["InstanceId"] for instance_info in instances] + ) + + except Exception as e: + if instances: + LOGGER.info( + f"Cleaning up instances {(instance['InstanceId'] for instance in instances)}..." + ) + ec2_client.terminate_instances( + InstanceIds=[instance_info["InstanceId"] for instance_info in instances] + ) + raise HeterogenousReservationError("Failed to launch via heterogenous approach") from e + return [] + + +@retry( + reraise=True, + stop=stop_after_delay(30 * 60), # Keep retrying for 30 minutes + wait=wait_random_exponential(min=60, max=5 * 60), # Retry after waiting 1-5 minutes +) +def launch_efa_instances_with_retry( + ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, + fn_name="", +): + """ + Helper function to launch EFA-capable EC2 instances with retry capability, to allow + multiple attempts when facing instance capacity issues. + :param ec2_client: boto3 EC2 Client object + :param ec2_instance_type: str EC2 Instance Type + :param availability_zone_options: list of availability zones in which to try to run instances + :param ec2_run_instances_definition: dict of parameters to pass to ec2_client.run_instances + :param fn_name: string - function name for ease of logging + :return: list of instance dicts launched via ec2_client.run_instances + """ + region = ec2_client.meta.region_name + LOGGER.info(f"Trying to launch {ec2_instance_type} for {fn_name} via capacity reservation...") + + heterogenous_reservation_launch = launch_efa_with_heterogenous_reservations( + ec2_client=ec2_client, + ec2_run_instances_definition=ec2_run_instances_definition, + fn_name=fn_name, + ) + + if heterogenous_reservation_launch: + return heterogenous_reservation_launch + + LOGGER.info( + f"Looks like you didn't have an EFA reservation for {fn_name}, let's see if we can seat you as a walk-in..." + ) + + instances = [] + for availability_zone in availability_zone_options: + try: + instances = launch_efa( + ec2_client, ec2_instance_type, ec2_run_instances_definition, availability_zone + ) + if instances: + break + except ClientError as e: + LOGGER.info( + f"Failed to launch in {availability_zone} for {fn_name} due to {e}\n" + "Retrying in the next availability zone." 
) + continue + if not instances: + raise RuntimeError( + f"Unable to launch {ec2_instance_type} instances in {region} for {fn_name}" + ) + return instances + + +def get_ec2_client(region): + return boto3.client("ec2", region_name=region, config=Config(retries={"max_attempts": 10})) + + +def get_instance_from_id(instance_id, region=DEFAULT_REGION): + """ + Get instance information using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Information about instance with matching instance ID + """ + if not instance_id: + raise Exception("No instance id provided") + client = boto3.Session(region_name=region).client("ec2") + instance = client.describe_instances(InstanceIds=[instance_id]) + if not instance: + raise Exception( + "Unable to describe the instance. \ + Did not return any reservations object" + ) + return instance["Reservations"][0]["Instances"][0] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_private_ip(instance_id, region=DEFAULT_REGION): + """ + Get Private IP of instance using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Private IP Address of instance with matching instance ID + """ + instance = get_instance_from_id(instance_id, region) + if not instance["PrivateIpAddress"]: + raise Exception("Private IP address not yet available") + return instance["PrivateIpAddress"] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_public_ip(instance_id, region=DEFAULT_REGION): + """ + Get Public IP of instance using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: IP Address of instance with matching instance ID + """ + instance = get_instance_from_id(instance_id, region) + if not instance["PublicIpAddress"]: + raise Exception("IP address not yet available") + return instance["PublicIpAddress"] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_public_ip_from_private_dns(private_dns, region=DEFAULT_REGION): + """ + Get Public IP of instance using private DNS + :param private_dns: Private DNS name of the instance to be queried + :param region: Region where query will be performed + :return: IP Address of instance with matching private DNS + """ + client = boto3.Session(region_name=region).client("ec2") + response = client.describe_instances( + Filters=[{"Name": "private-dns-name", "Values": [private_dns]}] + ) + return response.get("Reservations")[0].get("Instances")[0].get("PublicIpAddress") + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_instance_user(instance_id, region=DEFAULT_REGION): + """ + Get "ubuntu" or "ec2-user" based on AMI used to launch instance + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: user name + """ + instance = get_instance_from_id(instance_id, region) + user = "ubuntu" if instance["ImageId"] in UL_AMI_LIST else "ec2-user" + return user + + +def get_instance_state(instance_id, region=DEFAULT_REGION): + """ + Get state of instance using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: State of instance with matching instance ID + """ + instance = get_instance_from_id(instance_id, region) + return instance["State"]["Name"] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def check_instance_state(instance_id, state="running", region=DEFAULT_REGION): + """ + Compares the instance state with the 
state argument. + Retries 16 times with 60 seconds gap between retries. + :param instance_id: Instance ID to be queried + :param state: Expected instance state + :param region: Region where query will be performed + :return: State of instance with matching instance ID + """ + instance_state = get_instance_state(instance_id, region) + if state != instance_state: + raise Exception(f"Instance {instance_id} not in {state} state") + return instance_state + + +def get_system_state(instance_id, region=DEFAULT_REGION): + """ + Returns health checks state for instances + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: System state and Instance state of instance with matching instance ID + """ + if not instance_id: + raise Exception("No instance id provided") + client = boto3.Session(region_name=region).client("ec2") + response = client.describe_instance_status(InstanceIds=[instance_id]) + if not response: + raise Exception( + "Unable to describe instance status. \ + Did not return any response" + ) + instance_status_list = response["InstanceStatuses"] + if not instance_status_list: + raise Exception( + "Unable to describe instance status. \ + Did not return any instance statuses" + ) + if len(instance_status_list) < 1: + raise Exception( + "The instance id seems to be incorrect {}. \ + reservations seems to be empty".format( + instance_id + ) + ) + + instance_status = instance_status_list[0] + return ( + instance_status["SystemStatus"]["Status"], + instance_status["InstanceStatus"]["Status"], + ) + + +@retry(stop=stop_after_attempt(96), wait=wait_fixed(10)) +def check_system_state( + instance_id, system_status="ok", instance_status="ok", region=DEFAULT_REGION +): + """ + Compares the system state (Health Checks). + Retries 96 times with 10 seconds gap between retries + :param instance_id: Instance ID to be queried + :param system_status: Expected system state + :param instance_status: Expected instance state + :param region: Region where query will be performed + :return: System state and Instance state of instance with matching instance ID + """ + instance_state = get_system_state(instance_id, region=region) + if system_status != instance_state[0] or instance_status != instance_state[1]: + raise Exception( + "Instance {} not in \ + required state".format( + instance_id + ) + ) + return instance_state + + +def terminate_instance(instance_id, region=DEFAULT_REGION): + """ + Terminate EC2 instances with matching instance ID + :param instance_id: Instance ID to be terminated + :param region: Region where instance is located + """ + if not instance_id: + raise Exception("No instance id provided") + client = boto3.Session(region_name=region).client("ec2") + response = client.terminate_instances(InstanceIds=[instance_id]) + if not response: + raise Exception("Unable to terminate instance. No response received.") + instances_terminated = response["TerminatingInstances"] + if not instances_terminated: + raise Exception("Failed to terminate instance.") + if instances_terminated[0]["InstanceId"] != instance_id: + raise Exception("Failed to terminate instance. 
Unknown error.") + + +def get_instance_type_details(instance_type, region=DEFAULT_REGION): + """ + Get instance type details for a given instance type + :param instance_type: Instance type to be queried + :param region: Region where query will be performed + :return: Information about instance type + """ + client = boto3.client("ec2", region_name=region) + response = client.describe_instance_types(InstanceTypes=[instance_type]) + if not response or not response["InstanceTypes"]: + raise Exception("Unable to get instance details. No response received.") + if response["InstanceTypes"][0]["InstanceType"] != instance_type: + raise Exception( + f"Bad response received. Requested {instance_type} " + f"but got {response['InstanceTypes'][0]['InstanceType']}" + ) + return response["InstanceTypes"][0] + + +def get_instance_details(instance_id, region=DEFAULT_REGION): + """ + Get instance details for instance with given instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Information about instance with matching instance ID + """ + if not instance_id: + raise Exception("No instance id provided") + instance = get_instance_from_id(instance_id, region=region) + if not instance: + raise Exception("Could not find instance") + + return get_instance_type_details(instance["InstanceType"], region=region) + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_num_cpus(instance_id, region=DEFAULT_REGION): + """ + Get number of VCPUs on instance with given instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Number of VCPUs on instance with matching instance ID + """ + instance_info = get_instance_details(instance_id, region=region) + return instance_info["VCpuInfo"]["DefaultVCpus"] + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_memory(instance_id, region=DEFAULT_REGION): + """ + Get total RAM available on instance with given instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Total RAM available on instance with matching instance ID + """ + instance_info = get_instance_details(instance_id, region=region) + return instance_info["MemoryInfo"]["SizeInMiB"] + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_num_inferentias(instance_id=None, instance_type=None, region=DEFAULT_REGION): + """ + Get total number of neurons on instance with given instance ID + :param instance_id: Instance ID to be queried + :param instance_type: Instance Type to be queried + :param region: Region where query will be performed + :return: Number of neurons on instance with matching instance ID + """ + assert instance_id or instance_type, "Input must be either instance_id or instance_type" + instance_info = ( + get_instance_type_details(instance_type, region=region) + if instance_type + else get_instance_details(instance_id, region=region) + ) + return sum( + neuron_type["Count"] + for neuron_type in instance_info["InferenceAcceleratorInfo"]["Accelerators"] + if neuron_type["Name"] == "Inferentia" + ) + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_num_gpus(instance_id=None, instance_type=None, region=DEFAULT_REGION): + """ + Get total number of GPUs on instance with given instance ID + :param instance_id: Instance ID to be queried + :param instance_type: Instance Type to be queried + :param region: Region 
where query will be performed + :return: Number of GPUs on instance with matching instance ID + """ + assert instance_id or instance_type, "Input must be either instance_id or instance_type" + instance_info = ( + get_instance_type_details(instance_type, region=region) + if instance_type + else get_instance_details(instance_id, region=region) + ) + return sum(gpu_type["Count"] for gpu_type in instance_info["GpuInfo"]["Gpus"]) + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_num_efa_interfaces_for_instance_type(instance_type, region=DEFAULT_REGION): + """ + Get the maximum number of EFA interfaces available on a particular instance type + :param instance_type: str EC2 Instance type + :param region: str Region where ec2 instance must be launched + :return: NoneType/int Number of EFA interfaces that can be created on the given instance type. + Can be None if instance_type doesn't support EFA. + """ + instance_info = get_instance_type_details(instance_type, region) + num_efa_interfaces = ( + instance_info.get("NetworkInfo", {}).get("EfaInfo", {}).get("MaximumEfaInterfaces") + ) + return num_efa_interfaces + + +def get_ec2_fabric_connection(instance_id, instance_pem_file, region): + """ + establish connection with EC2 instance if necessary + :param instance_id: ec2_instance id + :param instance_pem_file: instance key name + :param region: Region where ec2 instance is launched + :return: Fabric connection object + """ + user = get_instance_user(instance_id, region=region) + conn = Connection( + user=user, + host=get_public_ip(instance_id, region), + connect_kwargs={"key_filename": [instance_pem_file]}, + connect_timeout=18000, + ) + return conn + + +def get_ec2_instance_tags(instance_id, region=DEFAULT_REGION, ec2_client=None): + ec2_client = ec2_client or get_ec2_client(region) + response = ec2_client.describe_tags(Filters=[{"Name": "resource-id", "Values": [instance_id]}]) + return {tag["Key"]: tag["Value"] for tag in response.get("Tags")} + + +# If IMDSv2 is enforced on EC2 instance with hop limit 1 then IMDSv2 api calls doesn't work +# If IMDSv2 is enforced on EC2 instance with hop limit > 1 then IMDSv2 api calls work +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def enforce_IMDSv2(instance_id, hop_limit, region=DEFAULT_REGION, ec2_client=None): + """ + Enable HTTP TOKENS required option on EC2 instance with given hop limit. + + :param instance_id: str, ec2 instance id + :param region: str, Region where ec2 instance is launched. + :param ec2_client: str, ec2 client. + :param hop_limit: str, hop limit to be set on ec2 instance. + """ + ec2_client = ec2_client or get_ec2_client(region) + response = ec2_client.modify_instance_metadata_options( + InstanceId=instance_id, + HttpTokens="required", + HttpPutResponseHopLimit=hop_limit, + HttpEndpoint="enabled", + ) + + if not response: + raise Exception("Unable to enforce IMDSv2. No response received.") + + time.sleep(2) + state = None + if response["InstanceId"]: + res = ec2_client.describe_instances(InstanceIds=[instance_id]) + if res: + metadata_options = res["Reservations"][0]["Instances"][0]["MetadataOptions"] + state = metadata_options["State"] + LOGGER.info(f"Modify Metadata options of EC2 instance: {metadata_options}") + if state != "applied": + raise Exception( + "Unable to enforce IMDSv2. Describe instance is not able to confirm if IMDSv2 enforced." 
) + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def enforce_IMDSv1(instance_id, region=DEFAULT_REGION, ec2_client=None): + """ + Enable IMDSv1 on EC2 instance. + + :param instance_id: str, ec2 instance id + :param region: str, Region where ec2 instance is launched. + :param ec2_client: boto3 ec2 client. + """ + ec2_client = ec2_client or get_ec2_client(region) + response = ec2_client.modify_instance_metadata_options( + InstanceId=instance_id, HttpTokens="optional", HttpPutResponseHopLimit=1 + ) + + if not response: + raise Exception("Unable to enforce IMDSv1. No response received.") + time.sleep(2) + state = None + if response["InstanceId"]: + res = ec2_client.describe_instances(InstanceIds=[instance_id]) + if res: + metadata_options = res["Reservations"][0]["Instances"][0]["MetadataOptions"] + state = metadata_options["State"] + LOGGER.info(f"Modify Metadata options of EC2 instance: {metadata_options}") + if state != "applied": + raise Exception( + "Unable to enforce IMDSv1. Describe instance is not able to confirm if IMDSv1 enforced." + ) + + +def fetch_s3_file_and_get_last_line(s3_location, local_filename="temp.txt"): + """ + Fetches the s3 file locally and extracts its last line. + + :param s3_location: str, s3 uri + :param local_filename: str, location where s3 file is to be downloaded locally. + :return: str, The last line of the file + """ + run(f"rm -rf {local_filename}", hide=True) + run(f"aws s3 cp {s3_location} {local_filename}", hide=True) + last_line_of_file = run(f"tail -n1 {local_filename}", hide=True).stdout.strip() + return last_line_of_file + + +def execute_asynchronus_testing_using_s3_bucket( + connection, + execution_command, + connection_timeout, + required_log_ending, + loop_time=2.5 * 3600, + log_location_within_ec2="~/container_tests/logs.txt", + s3_uri_for_saving_permanent_logs=None, + hang_detection_window=3, +): + """ + This method uses fabric to run the provided execution_command in asynchronous mode. While the execution command + is being executed in the image, it keeps on uploading the logs to the s3 bucket at fixed intervals. After the + loop_time is over, it checks the last line of the uploaded logs to see if it is the same as required_log_ending. + This is mainly used in cases where Fabric behaves in an undesired way due to long-lived connections. + + :param connection: Fabric connection object + :param execution_command: str, command that connection.run() will execute + :param connection_timeout: timeout for fabric connection + :param required_log_ending: str, The string that is desired to be present at the end of the logs + :param loop_time: int, seconds for which we would wait for the tests to execute on ec2 instance + :param log_location_within_ec2: Location within ec2 instance where the logs are being written. + :param s3_uri_for_saving_permanent_logs: Location where permanent s3 logs could be saved. + :param hang_detection_window: int, This method detects a hang if the length of the log file does not change for hang_detection_window consecutive iterations. 
+ """ + account_id = os.getenv("ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"]) + s3_bucket_name = f"dlc-async-test-{account_id}" + if not s3_uri_for_saving_permanent_logs: + unique_id = str(uuid.uuid4()) + unique_id_with_timestamp = f"{unique_id}-{int(time.time())}" + s3_location = f"s3://{s3_bucket_name}/{unique_id_with_timestamp}.txt" + else: + s3_location = s3_uri_for_saving_permanent_logs + connection.run(execution_command, hide=True, timeout=connection_timeout, asynchronous=True) + start_time = int(time.time()) + loop_count = 0 + local_filename = s3_location.replace(":", "-").replace("/", "-") + last_line_of_log = "" + line_count_list = [] + while (int(time.time()) - start_time <= loop_time) and ( + not last_line_of_log.endswith(required_log_ending) + ): + time.sleep(5 * 60) + loop_count += 1 + connection.run( + f"aws s3 cp {log_location_within_ec2} {s3_location}", timeout=connection_timeout + ) + last_line_of_log = fetch_s3_file_and_get_last_line(s3_location, local_filename) + number_of_lines_in_log_file = int( + run(f"wc -l {local_filename}", hide=True).stdout.strip().split()[0] + ) + line_count_list.append(number_of_lines_in_log_file) + number_of_previous_line_counts_to_check = hang_detection_window + if len(line_count_list) >= number_of_previous_line_counts_to_check: + if all( + line_count == line_count_list[-1] + for line_count in line_count_list[-number_of_previous_line_counts_to_check:] + ): + # If last 3 runs lead to same line number then it demonstrates no progress and hence we stop. + LOGGER.info( + f"No progress reported for past {number_of_previous_line_counts_to_check} iterations. Job most likely hanged so stopping the execution!!" + ) + break + LOGGER.info(f"Uploaded file to {s3_location} for {loop_count} number of times") + + if not last_line_of_log.endswith(required_log_ending): + raise ValueError( + f""" Test failed because the last row is not as expected. \n""" + f""" Last row in the log file ===> {last_line_of_log} \n""" + f""" expected ===> {required_log_ending}. \n""" + f""" Full log ===> {s3_location} \n""" + ) + + +def get_s3_uri_for_saving_permanent_logs( + framework, s3_bucket, test_type="ec2", custom_filename=None +): + """ + Helper function to get s3 uri where log files generated within test ec2 instances will be uploaded to. + + :param framework: str, tensorflow, pytorch etc. + :param s3_bucket: str, name of the bucket where we want to upload the logs. 
+ :param test_type: str, type of the test + :param custom_filename: str, custom name of the file that will be prepended with unique id to create the s3 filepath + """ + commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", f"default-{int(time.time())}") + unique_id = str(uuid.uuid4()) + unique_id_with_timestamp = f"{unique_id}-{int(time.time())}" + if custom_filename: + filename = f"{custom_filename}-logs-{unique_id_with_timestamp}.txt" + else: + filename = f"logs-{unique_id_with_timestamp}.txt" + s3_filepath = os.path.join(s3_bucket, test_type, framework, commit_id, filename) + s3_permanent_log_upload_uri = f"s3://{s3_filepath}" + return s3_permanent_log_upload_uri + + +def execute_ec2_training_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + executable="bash", + large_shm=False, + host_network=False, + container_name="ec2_training_container", + timeout=18000, + bin_bash_entrypoint=False, + enable_habana_async_execution=False, + enable_gdrcopy=False, +): + if executable not in ("bash", "python"): + raise RuntimeError( + f"This function only supports executing bash or python commands on containers" + ) + if executable == "bash": + executable = os.path.join(os.sep, "bin", "bash") + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + synapseai_version = get_synapseai_version_from_tag(ecr_uri) + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + # Run training command + shm_setting = '--shm-size="1g"' if large_shm else "" + network = '--network="host" ' if host_network else "" + container_runtime = "--runtime=habana -e HABANA_VISIBLE_DEVICES=all" if "hpu" in ecr_uri else "" + ompi_mca_btl = "-e OMPI_MCA_btl_vader_single_copy_mechanism=none" if "hpu" in ecr_uri else "" + cap_add = "--cap-add=sys_nice" if "hpu" in ecr_uri else "" + ipc = "--ipc=host" if "hpu" in ecr_uri and "pytorch" in ecr_uri else "" + hpu_env_vars = f"-e GIT_BRANCH={synapseai_version}" if "hpu" in ecr_uri else "" + habana_container_test_repo = ( + "-v ${HOME}/gaudi-test-suite:/gaudi-test-suite" if "hpu" in ecr_uri else "" + ) + neuron_device = "--device=/dev/neuron0" if "neuron" in ecr_uri else "" + gdr_device = "--device=/dev/gdrdrv" if enable_gdrcopy else "" + bin_bash_cmd = "--entrypoint /bin/bash " if bin_bash_entrypoint else "" + + LOGGER.info(f"execute_ec2_training_test pulling {ecr_uri}, with cmd {test_cmd}") + connection.run(f"docker pull {ecr_uri}", hide="out") + connection.run( + f"docker run {docker_runtime} --name {container_name} " + f"{container_runtime} {ompi_mca_btl} {cap_add} {hpu_env_vars} " + f"{ipc} {network}-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} " + f"{habana_container_test_repo} {shm_setting} {neuron_device} {gdr_device} -itd {bin_bash_cmd}{ecr_uri}", + hide=True, + ) + + if "habana" in ecr_uri: + execution_command = f"docker exec --user root {container_name} {executable} -c '{test_cmd}'" + required_log_ending = "Kudos!! 
Habana tests executed successfully" + framework = ( + "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None + ) + test_type = "ec2" + account_id_prefix = os.getenv( + "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] + )[:3] + s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" + s3_uri_permanent_logs = get_s3_uri_for_saving_permanent_logs( + framework, s3_bucket=s3_bucket_for_permanent_logs, test_type=test_type + ) + if enable_habana_async_execution == True: + execute_asynchronus_testing_using_s3_bucket( + connection, + execution_command, + timeout, + required_log_ending, + loop_time=4 * 3600, + s3_uri_for_saving_permanent_logs=s3_uri_permanent_logs, + hang_detection_window=15, + ) + return + else: + run_output = connection.run(execution_command, hide=True, timeout=timeout) + try: + connection.run(f"aws s3 cp ~/container_tests/logs.txt {s3_uri_permanent_logs}") + LOGGER.info(f"Uploaded logs at: {s3_uri_permanent_logs}") + except: + LOGGER.info(f"Could not upload the logs") + return run_output + + # Hack not sure why but see the following. since not using latest driver yet in the AMI, doing this for now + # [ 214.939271] Neuron Driver Started with Version:2.x.381.0-b70a76a18efb5e89ffed987461e9a1009d8b6f1e + # [ 214.939619] neuron-driver 0000:00:1e.0: BAR 4: can't reserve [mem 0x1000000000-0x17ffffffff 64bit pref] + if "neuron" in ecr_uri: + connection.run(f"sudo modprobe -r neuron && sudo modprobe -i neuron") + + LOGGER.info(f"execute_ec2_training_test running {ecr_uri}, with cmd {test_cmd}") + ec2_res = connection.run( + f"docker exec --user root {container_name} {executable} -c '{test_cmd}'", + hide=True, + timeout=timeout, + ) + LOGGER.info(f"execute_ec2_training_test completed {ecr_uri}, with cmd {test_cmd}") + return ec2_res + + +def execute_ec2_telemetry_test( + connection, + ecr_uri, + call_type, + container_name, + test_cmd, + opt_in=False, + region=DEFAULT_REGION, + timeout=900, +): + """ + Execute telemetry tests on EC2 instances using Docker containers. + + Args: + connection: EC2 connection object + ecr_uri (str): ECR image URI + call_type (str): Type of test to run ('bashrc', 'entrypoint', 'framework') + container_name (str): Base name for the container + test_cmd (str): Test command to execute + opt_in (bool): Whether to run in opt-in mode (default: False) + region (str): AWS region + timeout (int): Timeout in seconds (default: 900) + + Returns: + Result object from the connection.run command + + Raises: + RuntimeError: If invalid call_type is provided + """ + # Validate call type + VALID_CALL_TYPES = {"bashrc", "entrypoint", "framework"} + if call_type not in VALID_CALL_TYPES: + raise RuntimeError(f"Invalid call_type. 
Must be one of: {', '.join(VALID_CALL_TYPES)}") + + # Set up Docker runtime configuration + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + if "pytorch" in ecr_uri: + framework_env = f"-e FRAMEWORK='torch'" + elif "tensorflow" in ecr_uri: + framework_env = f"-e FRAMEWORK='tensorflow'" + else: + framework_env = "" + opt_out_env = "" if opt_in else "-e OPT_OUT_TRACKING='true'" + + # Set up container and mount configuration + test_suffix = "opt_in" if opt_in else "opt_out" + container_name = ( + f"{container_name}_{call_type}_{test_suffix}" + if call_type in {"bashrc", "entrypoint"} + else f"{container_name}_{call_type}" + ) + + container_test_local_dir = os.path.join("$HOME", "container_tests") + mount_path = f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')}" + + # Prepare test command + test_cmd = f"{test_cmd} {call_type} {test_suffix}" + LOGGER.info(f"Executing test: {test_cmd}") + + # for entrypoint test, we aviod invoking bashrc telemetry + nobashrc_cmd = f"bash --norc" if call_type == "entrypoint" else "" + + # for other tests, we need to aviod using entrypoint telemetry + entrypoint_override = f"--entrypoint /bin/bash" if call_type != "entrypoint" else "" + + try: + # Login to ECR and pull image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + LOGGER.info(f"Pulling image: {ecr_uri}") + connection.run(f"docker pull {ecr_uri}", hide="out") + + # Execute test based on call type + # Start container + connection.run( + f"docker run {docker_runtime} --name {container_name} " + f" {mount_path} " + f"-itd -e TEST_MODE='1' {framework_env} {opt_out_env} {entrypoint_override} {ecr_uri} {nobashrc_cmd}", + hide=True, + ) + + # Execute test command + ec2_res = connection.run( + f"docker exec --user root {container_name} bash -c '{test_cmd}'", + hide=True, + timeout=timeout, + ) + + LOGGER.info(f"Test completed for {call_type} on {ecr_uri}") + return ec2_res + + except Exception as e: + LOGGER.error(f"Test failed: {str(e)}") + raise + + +def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REGION): + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + # Run training command + connection.run( + f"docker run {docker_runtime} --name ec2_inference_container -v {container_test_local_dir}:{os.path.join(os.sep, 'test')}" + f" -itd {ecr_uri} bash", + hide=True, + ) + connection.run( + f"docker exec --user root ec2_inference_container {os.path.join(os.sep, 'bin', 'bash')} -c '{test_cmd}'", + hide=True, + timeout=3000, + ) + + +def execute_ec2_training_performance_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + post_process=None, + data_source="", + threshold=None, +): + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + + timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") + log_name = ( + f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" + ) + log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name) + + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + 
login_to_ecr_registry(connection, account_id, region) + + connection.run(f"docker pull {ecr_uri}", hide=True) + + # Run training command, display benchmark results to console + connection.run( + f"docker run {docker_runtime} --user root " + f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} " + f"-e PR_CONTEXT={1 if is_pr_context() else 0} " + f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {ecr_uri} " + f"{os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}" + ) + ec2_performance_upload_result_to_s3_and_validate( + connection, + ecr_uri, + log_location, + data_source, + threshold, + post_process, + log_name, + ) + + +def execute_ec2_habana_training_performance_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + data_source="", + cards_num=None, + timeout=18000, +): + container_test_local_dir = os.path.join("$HOME", "container_tests") + + timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") + log_name = ( + f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" + ) + synapseai_version = get_synapseai_version_from_tag(ecr_uri) + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + connection.run(f"docker pull -q {ecr_uri}") + + container_runtime = "--runtime=habana -e HABANA_VISIBLE_DEVICES=all" + hpu_env_vars = f"-e CARDS_NUM={cards_num} -e GIT_BRANCH={synapseai_version}" + ompi_mca_btl = "-e OMPI_MCA_btl_vader_single_copy_mechanism=none" + cap_add = "--cap-add=sys_nice" + ipc = "--ipc=host" if "pytorch" in ecr_uri else "" + habana_container_test_repo = "${HOME}/gaudi-test-suite:/gaudi-test-suite" + execution_command = ( + f"docker run --user root " + f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} " + f"-e PR_CONTEXT={1 if is_pr_context() else 0} " + f"{container_runtime} {ompi_mca_btl} {hpu_env_vars} {cap_add} {ipc} " + f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} -v {habana_container_test_repo} " + f"{ecr_uri} {os.path.join(os.sep, 'bin', 'bash')} -c '{test_cmd}'" + ) + + framework = ( + "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None + ) + account_id_prefix = os.getenv( + "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] + )[:3] + s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" + test_type = "benchmark" + custom_filename = test_cmd.split(f"{os.sep}")[-1] + custom_filename += f"-cards-{cards_num}" if cards_num else "-cards-0" + s3_uri_permanent_logs = get_s3_uri_for_saving_permanent_logs( + framework, + s3_bucket=s3_bucket_for_permanent_logs, + test_type=test_type, + custom_filename=custom_filename, + ) + required_log_ending = "Kudos!! 
Habana tests executed successfully" + execute_asynchronus_testing_using_s3_bucket( + connection, + execution_command, + timeout, + required_log_ending, + loop_time=4 * 3600, + s3_uri_for_saving_permanent_logs=s3_uri_permanent_logs, + hang_detection_window=15, + ) + LOGGER.info(f"Uploaded logs at: {s3_uri_permanent_logs}") + return + + +def execute_ec2_inference_performance_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + post_process=None, + data_source="", + threshold=None, +): + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") + log_name = ( + f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" + ) + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + connection.run(f"docker pull -q {ecr_uri}") + + # Run training command, display benchmark results to console + repo_name, image_tag = ecr_uri.split("/")[-1].split(":") + container_name = f"{repo_name}-performance-{image_tag}-ec2" + connection.run( + f"docker run {docker_runtime} -d --name {container_name} " + f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} " + f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {ecr_uri}" + ) + try: + connection.run( + f"docker exec --user root {container_name} " + f"{os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}" + ) + except Exception as e: + raise Exception("Failed to exec benchmark command.\n", e) + finally: + connection.run(f"docker rm -f {container_name}") + log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name) + ec2_performance_upload_result_to_s3_and_validate( + connection, + ecr_uri, + log_location, + data_source, + threshold, + post_process, + log_name, + ) + + +def ec2_performance_upload_result_to_s3_and_validate( + connection, + ecr_uri, + log_location, + data_source, + threshold, + post_process, + log_name, + instance_type=None, +): + framework = ( + "tensorflow" if "tensorflow" in ecr_uri else "mxnet" if "mxnet" in ecr_uri else "pytorch" + ) + framework_version = re.search(r"\d+(\.\d+){2}", ecr_uri).group() + py_version = "py2" if "py2" in ecr_uri else "py37" if "py37" in ecr_uri else "py3" + processor = "gpu" if "gpu" in ecr_uri else "cpu" + work_type = "training" if "training" in ecr_uri else "inference" + s3_location = os.path.join( + BENCHMARK_RESULTS_S3_BUCKET, + framework, + framework_version, + "ec2", + work_type, + processor, + py_version, + log_name, + ) + params = {"connection": connection, "log_location": log_location} + if "threshold" in signature(post_process).parameters: + params["threshold"] = threshold + performance_number = post_process(**params) + unit = ( + "s" + if work_type == "inference" and framework == "tensorflow" + else ( + "ms" + if work_type == "inference" and framework == "pytorch" + else ( + "s/epoch" + if work_type == "training" and framework == "pytorch" and data_source == "imagenet" + else "images/sec" + ) + ) + ) + description = "p99 latency " if unit == "s" or unit == "ms" else "" + for k, v in performance_number.items(): + performance_statement = ( + f"{framework} {framework_version} ec2 {work_type} {processor} {py_version} " + f"{instance_type if instance_type else ''} {data_source} {k} {description}: {v} {unit}, threshold: {threshold[k]} {unit}" + ) + 
connection.run(f"echo {performance_statement} | sudo tee -a {log_location}") + LOGGER.info(f"{performance_statement}") + connection.run(f"aws s3 cp {log_location} {s3_location}") + LOGGER.info(f"To retrieve complete benchmark log, check {s3_location}") + + def _assertion_results(): + if "Cost" in performance_number: + return performance_number["Cost"] < threshold["Cost"] + if "Throughput" in performance_number: + return performance_number["Throughput"] > threshold["Throughput"] + if len(performance_number) == 0: + return False + failure_count = 0 + for k, v in performance_number.items(): + if v > threshold[k]: + failure_count += 1 + return failure_count <= 2 + + for _ in performance_number: + assert _assertion_results(), ( + f"{framework} {framework_version} ec2 {work_type} {processor} {py_version} {data_source} " + f"Benchmark Result {performance_number} does not reach the threshold {threshold}" + ) + + +def post_process_inference(connection, log_location, threshold): + log_content = connection.run(f"cat {log_location}").stdout.split("\n") + performance_number = {} + for line in log_content: + if "p99" in line: + for key in threshold.keys(): + if key in line: + performance_number[key] = float( + re.search( + r"(p99[ ]*(Latency)?[ ]*:[ ]*)(?P[0-9]+\.?[0-9]+)", + line, + ).group("result") + ) + break + return performance_number + + +def post_process_mxnet_ec2_performance(connection, log_location): + log_content = connection.run(f"cat {log_location}").stdout.split("\n") + total = 0.0 + n = 0 + for line in log_content: + if "samples/sec" in line and "warmup" not in line: + throughput = re.search(r"((?P[0-9]+\.?[0-9]+)[ ]+samples/sec)", line).group( + "throughput" + ) + total += float(throughput) + n += 1 + if total and n: + return {"Throughput": total / n} + else: + raise ValueError("total: {}; n: {} -- something went wrong".format(total, n)) + + +def install_python_in_instance(context, python_version="3.9"): + """ + Install python on DLAMI EC2 instances to create a consistent test environment that is agnostic to AMI used for test. + This helper function assumes that the EC2 instance uses a DLAMI. The /etc/profile.d/dlami.sh file doesn't exist + in other AMIs. If support for other AMIs is needed, this function will need to be updated. + :param context: Invoke Context / Fabric Connection object + :param python_version: str python version to install, such as 3.8, 3.9, etc. + :return: None + """ + if context.run("pyenv --version", warn=True, hide=True).failed: + context.run( + """ls ~/.pyenv || git clone https://github.com/pyenv/pyenv.git ~/.pyenv""", hide=True + ) + + # for images that do not have /etc/profile.d/dlami.sh, we will make it here + if context.run("test -f /etc/profile.d/dlami.sh", warn=True, hide=True).failed: + LOGGER.info("/etc/profile.d/dlami.sh does not exist. 
Making...") + context.run("sudo touch /etc/profile.d/dlami.sh") + LOGGER.info("adding /etc/profile.d/dlami.sh to .bashrc") + context.run( + """echo '[ -z "$PS1" ] && source /etc/profile.d/dlami.sh'|cat - ~/.bashrc > ~/temprc """ + """&& mv ~/temprc ~/.bashrc""", + hide=True, + ) + + context.run("sudo chmod 666 /etc/profile.d/dlami.sh", hide=True) + context.run( + """echo 'export PYENV_ROOT="$HOME/.pyenv"' >> /etc/profile.d/dlami.sh""", hide=True + ) + context.run( + """echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> /etc/profile.d/dlami.sh""", + hide=True, + ) + context.run("""echo 'eval "$(pyenv init -)"' >> /etc/profile.d/dlami.sh""", hide=True) + context.run("sudo chmod 644 /etc/profile.d/dlami.sh", hide=True) + context.run("sudo dnf update -y", hide=True) + context.run( + ( + "sudo dnf install -y make gcc gcc-c++ openssl-devel zlib-devel " + "bzip2-devel readline-devel sqlite-devel llvm " + "ncurses-devel xz tk-devel libxml2-devel xmlsec1-devel libffi-devel xz-devel --skip-broken" + ), + hide=True, + ) + + context.run(f"pyenv install {python_version}", hide=True) + context.run(f"pyenv global {python_version}", hide=True) + + # Validate that installed python version is the same as requested python version + python_version_response = context.run("python --version", hide=True) + python_version_match = re.search(r"Python (\d+(\.\d+)+)", python_version_response.stdout) + assert python_version_match, "Running 'python --version' returned None" + installed_python_version = python_version_match.group(1) + # Use SpecifierSet("=={python_version}.*") to accommodate python_version of the form X.Y as well as X.Y.Z + assert Version(installed_python_version) in SpecifierSet( + f"=={python_version}.*" + ), f"Installed python version {installed_python_version} does not match required python_version {python_version}" + + +def get_availability_zone_ids(ec2_client): + """ + Obtain list of AZs in a particular region using ec2_client + :param ec2_client: boto3 EC2 Client object + :return: list of str AZ names + """ + response = ec2_client.describe_availability_zones() + return [az["ZoneName"] for az in response["AvailabilityZones"]] + + +def get_default_vpc_id(ec2_client): + """ + Get vpd-id of default VPC in a particular region using ec2_client in that region + :param ec2_client: boto3 EC2 Client object + :return: str Default vpc-id + """ + response = ec2_client.describe_vpcs(Filters=[{"Name": "is-default", "Values": ["true"]}]) + default_vpc_id = response["Vpcs"][0]["VpcId"] + return default_vpc_id + + +def get_default_security_group_id(ec2_client): + """ + Get security-group-id of default SG on the default VPC in a particular region using ec2_client + :param ec2_client: boto3 EC2 Client object + :return: str Default security-group-id + """ + default_vpc_id = get_default_vpc_id(ec2_client) + response = ec2_client.describe_security_groups( + GroupNames=["default"], + Filters=[{"Name": "vpc-id", "Values": [default_vpc_id]}], + ) + default_security_group_id = response["SecurityGroups"][0]["GroupId"] + return default_security_group_id + + +def get_efa_enabled_security_group_id(ec2_client): + """ + Get security-group-id of custom EFA-enabled SG in the default VPC in a particular region + :param ec2_client: boto3 EC2 Client object + :return: str security-group-id of SG named "EFA-enabled" + """ + default_vpc_id = get_default_vpc_id(ec2_client) + response = ec2_client.describe_security_groups( + GroupNames=["EFA-enabled"], + Filters=[{"Name": "vpc-id", "Values": [default_vpc_id]}], + 
) + + efa_security_group_id = response["SecurityGroups"][0]["GroupId"] + return efa_security_group_id + + +def get_default_subnet_for_az(ec2_client, availability_zone): + """ + Get subnet-id associated with a particular AZ using ec2_client for that region + :param ec2_client: boto3 EC2 Client object + :param availability_zone: str Availability Zone name + :return: str subnet-id + """ + response = ec2_client.describe_subnets( + Filters=[ + {"Name": "availability-zone", "Values": [availability_zone]}, + {"Name": "default-for-az", "Values": ["true"]}, + ] + ) + az_subnet_id = response["Subnets"][0]["SubnetId"] + return az_subnet_id + + +def get_subnet_id_by_vpc(ec2_client, vpc_id): + + response = ec2_client.describe_subnets( + Filters=[ + { + "Name": "vpc-id", + "Values": [ + vpc_id, + ], + }, + ], + ) + + subnet_ids = [] + for subnet in response["Subnets"]: + if subnet["SubnetId"] is not None: + subnet_ids.append(subnet["SubnetId"]) + + return subnet_ids + + +def get_vpc_id_by_name(ec2_client, vpc_name): + """ + Get VPC ID by VPC name tag + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str VPC ID of the VPC name + """ + response = ec2_client.describe_vpcs(Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]).get( + "Vpcs", [] + ) + + if not response: + raise Exception(f"No VPC found with Name tag: {vpc_name}") + elif len(response) > 1: + raise Exception(f"Multiple VPCs found with Name tag: {vpc_name}") + + vpc_id = response[0]["VpcId"] + + return vpc_id + + +def get_default_security_group_id_by_vpc_id(ec2_client, vpc_name): + """ + Get default SG ID for a non-default VPC + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the default SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": ["default"]}, + ], + ) + + security_group_id = response["SecurityGroups"][0]["GroupId"] + return security_group_id + except Exception as e: + LOGGER.error(f"Error in get_default_security_group_id_by_vpc_id: {str(e)}") + raise + + +def get_ipv6_efa_enabled_security_group_id(ec2_client, vpc_name): + """ + Get EFA-enabled SG ID for IPv6 VPC by identifying security groups that allow + all traffic within themselves + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the EFA-enabled SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + ] + ) + + for sg in response["SecurityGroups"]: + inbound_all_traffic = any( + rule["IpProtocol"] == "-1" + and any( + pair["GroupId"] == sg["GroupId"] for pair in rule.get("UserIdGroupPairs", []) + ) + for rule in sg["IpPermissions"] + ) + + outbound_all_traffic = any( + rule["IpProtocol"] == "-1" + and any( + pair["GroupId"] == sg["GroupId"] for pair in rule.get("UserIdGroupPairs", []) + ) + for rule in sg["IpPermissionsEgress"] + ) + + if inbound_all_traffic and outbound_all_traffic: + return sg["GroupId"] + + raise ValueError( + f"No EFA-enabled security group found in VPC {vpc_name}. Expected a sg that allows all traffic to and from itself." 
+ ) + except Exception as e: + LOGGER.error(f"Error when getting IPv6 EFA-enabled sg id: {str(e)}") + raise + + +def get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone): + """ + Get IPv6-enabled subnet ID in the a particular availability zone + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :param availability_zone: str AZ name + :return: str Subnet ID of an IPv6-enabled subnet + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + route_tables = ec2_client.describe_route_tables( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + )["RouteTables"] + + response = ec2_client.describe_subnets( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "availability-zone", "Values": [availability_zone]}, + ] + ) + + ipv6_subnets = [ + subnet + for subnet in response["Subnets"] + if subnet.get("Ipv6CidrBlockAssociationSet") + and is_public_subnet(subnet["SubnetId"], route_tables) + ] + + if not ipv6_subnets: + raise Exception( + f"No IPv6-enabled subnet found in AZ {availability_zone} for VPC {vpc_id}" + ) + + return ipv6_subnets[0]["SubnetId"] + except Exception as e: + LOGGER.error( + f"Error in when getting IPv6 enabled subnet for AZ {availability_zone}: {str(e)}" + ) + raise + + +def is_public_subnet(subnet_id, route_tables): + """ + Check if a subnet is public by verifying if it has a route table with an Internet Gateway + that routes all IPv4 or IPv6 traffic + :param subnet_id: str the subnet ID to check + :param route_tables: list route tables from the VPC + :return: True if subnet is public, False otherwise + """ + for route_table in route_tables: + has_igw = False + for route in route_table.get("Routes", []): + if route.get("GatewayId", "").startswith("igw-"): + if ( + route.get("DestinationCidrBlock") == "0.0.0.0/0" + or route.get("DestinationIpv6CidrBlock") == "::/0" + ): + has_igw = True + break + if not has_igw: + continue + + # check if subnet is associated with route table + for association in route_table.get("Associations", []): + if association.get("SubnetId") == subnet_id: + return True + + return False + + +def generate_standard_dual_stack_network_interface(ec2_client, availability_zone): + """ + Generate network interface configuration for dual-stack (IPv4/IPv6) instances. + :param ec2_client: boto3 EC2 Client + :param availability_zone: str AZ in which the instance must be created + :return: list containing a single network interface configuration for dual-stack + """ + try: + if not IPV6_VPC_NAME: + raise ValueError("IPv6 VPC name is not set") + + ipv6_default_sg = get_default_security_group_id_by_vpc_id(ec2_client, IPV6_VPC_NAME) + ipv6_subnet_id = get_ipv6_enabled_subnet_for_az( + ec2_client, IPV6_VPC_NAME, availability_zone + ) + + network_interfaces = [ + { + "DeviceIndex": 0, + "DeleteOnTermination": True, + "Groups": [ipv6_default_sg], + "SubnetId": ipv6_subnet_id, + "Ipv6AddressCount": 1, + } + ] + + return network_interfaces + + except Exception as e: + LOGGER.error( + f"Failed to generate dual-stack network interface in AZ {availability_zone}: {str(e)}" + ) + raise + + +def generate_network_interfaces(ec2_client, ec2_instance_type, availability_zone): + """ + Generate list of EFA-network-interfaces based on the number of network-interfaces available + on a given instance type. 
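+    Each entry in the returned list is shaped like the following (illustrative placeholder IDs):
+
+        {"DeviceIndex": 0, "NetworkCardIndex": 0, "DeleteOnTermination": True,
+         "InterfaceType": "efa", "Groups": ["sg-..."], "SubnetId": "subnet-..."}
+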
+ :param ec2_client: boto3 EC2 Client + :param ec2_instance_type: str EC2 Instance Type with network interface to be configured + :param availability_zone: str AZ in which the instance must be created + :return: list of dicts mapping each network-interface available + """ + num_efa_interfaces = get_num_efa_interfaces_for_instance_type(ec2_instance_type) + if not num_efa_interfaces: + raise AttributeError(f"Unable to get number of EFA Interfaces for {ec2_instance_type}") + + if ENABLE_IPV6_TESTING: + vpc_name = IPV6_VPC_NAME + efa_sg = get_ipv6_efa_enabled_security_group_id(ec2_client, vpc_name) + sg_ids = [efa_sg] + subnet_id = get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone) + else: + default_sg = get_default_security_group_id(ec2_client) + efa_sg = get_efa_enabled_security_group_id(ec2_client) + sg_ids = [default_sg, efa_sg] + subnet_id = get_default_subnet_for_az(ec2_client, availability_zone) + + network_interfaces = [] + for i in range(num_efa_interfaces): + interface = { + "DeviceIndex": 0 if i == 0 else 1, + "NetworkCardIndex": i, + "DeleteOnTermination": True, + "InterfaceType": "efa", + "Groups": sg_ids, + "SubnetId": subnet_id, + } + + network_interfaces.append(interface) + + return network_interfaces + + +def get_network_interface_id(instance_id, region=DEFAULT_REGION): + """ + Gets the network interface at index 0 from the instance_id. Meant to be used + with p4d instance with 4 efa devices + """ + instance = get_instance_from_id(instance_id, region) + network_interfaces_info = instance["NetworkInterfaces"] + for device in network_interfaces_info: + if device["Attachment"]["DeviceIndex"] == 0: + return device["NetworkInterfaceId"] + + raise Exception("Could not find network device 0, retry operation") + + +def get_ipv6_address_for_eth0(instance_id, region=DEFAULT_REGION): + """ + Gets the IPv6 address specifically from eth0 (Device Index 0) of an EC2 instance + """ + instance = get_instance_from_id(instance_id, region) + network_interfaces_info = instance["NetworkInterfaces"] + for device in network_interfaces_info: + if device["Attachment"]["DeviceIndex"] == 0: + if device["Ipv6Addresses"]: + return device["Ipv6Addresses"][0]["Ipv6Address"] + LOGGER.info(f"No IPv6 address found on eth0 for instance {instance_id}") + return None + + LOGGER.error(f"Could not find eth0 for instance {instance_id}") + return None + + +def attach_elastic_ip(network_interface_id, region="us-east-1", is_ipv6=False): + """ + Creates and attaches an elastic ip to a network interface which is already + attached to an efa enabled device. This is needed specifically for 4 efa devices + attached to a p4d instance. Having multiple network devices prevents automatic + public ip address assignment, so we must do it manually. + """ + ec2_client = boto3.client("ec2", region_name=region) + arguments_dict = { + "Domain": "vpc", + "TagSpecifications": [ + { + "ResourceType": "elastic-ip", + "Tags": [{"Key": "Name", "Value": f"elastic_ip_{network_interface_id}"}], + } + ], + } + elastic_ip = ec2_client.allocate_address(**arguments_dict) + elastic_ip_allocation_id = elastic_ip["AllocationId"] + response = ec2_client.associate_address( + AllocationId=elastic_ip_allocation_id, NetworkInterfaceId=network_interface_id + ) + if is_ipv6: + ec2_client.assign_ipv6_addresses( + NetworkInterfaceId=network_interface_id, Ipv6AddressCount=1 + ) + return elastic_ip_allocation_id + + +def delete_elastic_ips(elastic_ip_allocation_ids, ec2_client): + """ + Deletes elastic ips created for efa p4d testing. 
+ For default VPC (IPv4): can release directly + For non-default VPC (IPv6): need to disassociate before release + """ + for allocation_id in elastic_ip_allocation_ids: + try: + if ENABLE_IPV6_TESTING: + address = ec2_client.describe_addresses(AllocationIds=[allocation_id])["Addresses"][ + 0 + ] + if "AssociationId" in address: + ec2_client.disassociate_address(AssociationId=address["AssociationId"]) + time.sleep(10) + ec2_client.release_address(AllocationId=allocation_id) + except Exception as e: + LOGGER.error(f"Failed to delete elastic ip {allocation_id}: {str(e)}") + + +def create_name_tags_for_instance(instance_id, name_tag, region): + """ + Create name tags for an instance + :param instance_id: str Instance ID on which to apply the given Name tag + :param name_tag: str Name tag to be applied + :param region: str Region in which instance is running + """ + ec2_client = boto3.client("ec2", region_name=region) + response = ec2_client.create_tags( + Resources=[instance_id], + Tags=[{"Key": "Name", "Value": name_tag}], + ) + if not response: + raise Exception( + "Unable to create name tag {0} for the instance {1}".format(name_tag, instance_id) + ) + + +def get_efa_devices_on_instance(connection): + """ + Get list of EFA devices available for use in an instance + :param connection: Fabric Connection object + :return: list of str device paths + """ + response = connection.run("ls /dev/infiniband/uverbs*") + devices = response.stdout.split() + return devices From 2213423d3d86688444c5d4f12b2fa3e4d78cd67d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:30:40 -0700 Subject: [PATCH 03/33] add vllm specific ec2 infra changes --- infra/test_infra/ec2/vllm/__init__.py | 0 infra/test_infra/ec2/vllm/fsx_utils.py | 302 +++++++++ infra/test_infra/ec2/vllm/setup_ec2.py | 645 ++++++++++++++++++++ infra/test_infra/ec2/vllm/setup_fsx_vllm.sh | 75 +++ 4 files changed, 1022 insertions(+) create mode 100644 infra/test_infra/ec2/vllm/__init__.py create mode 100644 infra/test_infra/ec2/vllm/fsx_utils.py create mode 100644 infra/test_infra/ec2/vllm/setup_ec2.py create mode 100644 infra/test_infra/ec2/vllm/setup_fsx_vllm.sh diff --git a/infra/test_infra/ec2/vllm/__init__.py b/infra/test_infra/ec2/vllm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/infra/test_infra/ec2/vllm/fsx_utils.py b/infra/test_infra/ec2/vllm/fsx_utils.py new file mode 100644 index 000000000000..ce27840fff90 --- /dev/null +++ b/infra/test_infra/ec2/vllm/fsx_utils.py @@ -0,0 +1,302 @@ +import time +from invoke import run +from typing import Dict, List, Any +import boto3 +from botocore.exceptions import ClientError + +from infra.test_infra.test_infra_utils import create_logger + +LOGGER = create_logger(__name__) + + +class FsxSetup: + """ + A utility class for setting up and managing FSx for Lustre filesystems + and related AWS and Kubernetes resources. 
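+
+    Example (illustrative resource IDs and values):
+        fsx = FsxSetup(region="us-west-2")
+        fs_info = fsx.create_fsx_filesystem(
+            subnet_id="subnet-0123456789abcdef0",
+            security_group_ids=["sg-0123456789abcdef0"],
+            storage_capacity=1200,
+            deployment_type="SCRATCH_2",
+            tags={"Name": "fsx-lustre-vllm-ec2-test"},
+        )
+        # fs_info is a dict with filesystem_id, dns_name, and mount_name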
+
+    : param region: AWS region where resources will be created (default: "us-west-2")
+    """
+
+    def __init__(self, region: str = "us-west-2"):
+        self.region = region
+
+    def create_fsx_filesystem(
+        self,
+        subnet_id: str,
+        security_group_ids: List[str],
+        storage_capacity: int,
+        deployment_type: str,
+        tags: Dict[str, str],
+    ):
+        """
+        Create FSx filesystem with given configuration
+        : param subnet_id: subnet ID where FSx will be created
+        : param security_group_ids: list of security group IDs
+        : param storage_capacity: storage capacity in GiB
+        : param deployment_type: FSx deployment type
+        : param tags: dictionary of tags to apply to the FSx filesystem
+        : return: dictionary containing filesystem details
+        """
+        tags_param = " ".join([f"Key={k},Value={v}" for k, v in tags.items()])
+
+        try:
+            fsx_id = run(
+                f"aws fsx create-file-system"
+                f" --file-system-type LUSTRE"
+                f" --storage-capacity {storage_capacity}"
+                f" --subnet-ids {subnet_id}"
+                f' --security-group-ids {" ".join(security_group_ids)}'
+                f" --lustre-configuration DeploymentType={deployment_type}"
+                f" --tags {tags_param}"
+                f" --file-system-type-version 2.15"
+                f' --query "FileSystem.FileSystemId"'
+                f" --output text"
+            ).stdout.strip()
+
+            LOGGER.info(f"Created FSx filesystem: {fsx_id}")
+
+            filesystem_info = self.wait_for_filesystem(fsx_id)
+            return filesystem_info
+
+        except Exception as e:
+            LOGGER.error(f"Failed to create FSx filesystem: {e}")
+            raise
+
+    def delete_fsx_filesystem(self, fsx_id: str):
+        """
+        Delete the FSx filesystem with the given filesystem ID
+        : param fsx_id: FSx filesystem ID to delete
+        """
+        try:
+            fsx_id = run(
+                f"aws fsx delete-file-system"
+                f" --file-system-id {fsx_id}"
+                f' --query "FileSystem.FileSystemId"'
+                f" --output text"
+            ).stdout.strip()
+
+            print(f"Deleted FSx filesystem: {fsx_id}")
+
+        except Exception as e:
+            LOGGER.error(f"Failed to delete FSx filesystem: {e}")
+            raise
+
+    def wait_for_filesystem(self, filesystem_id: str):
+        """
+        Wait for FSx filesystem to become available and return its details
+        : param filesystem_id: FSx filesystem ID
+        : return: dictionary containing filesystem details (filesystem_id, dns_name, mount_name)
+        : raises: Exception if filesystem enters FAILED, DELETING, or DELETED state
+        """
+        print(f"Waiting for FSx filesystem {filesystem_id} to be available...")
+        while True:
+            status = run(
+                f"aws fsx describe-file-systems --file-system-id {filesystem_id} "
+                f"--query 'FileSystems[0].Lifecycle' --output text"
+            ).stdout.strip()
+
+            if status == "AVAILABLE":
+                break
+            elif status in ["FAILED", "DELETING", "DELETED"]:
+                raise Exception(f"FSx filesystem entered {status} state")
+
+            print(f"FSx status: {status}, waiting...")
+            time.sleep(30)
+
+        # get fs DNS and mount name
+        fsx_dns = run(
+            f"aws fsx describe-file-systems --file-system-id {filesystem_id} "
+            f"--query 'FileSystems[0].DNSName' --output text"
+        ).stdout.strip()
+
+        fsx_mount = run(
+            f"aws fsx describe-file-systems --file-system-id {filesystem_id} "
+            f"--query 'FileSystems[0].LustreConfiguration.MountName' --output text"
+        ).stdout.strip()
+
+        return {"filesystem_id": filesystem_id, "dns_name": fsx_dns, "mount_name": fsx_mount}
+
+    def create_fsx_security_group(self, ec2_cli, vpc_id, group_name, description):
+        """
+        Create a security group for FSx Lustre and add inbound rules.
+
+        :param ec2_cli: boto3 EC2 client
+        :param vpc_id: The ID of the VPC where the security group will be created
+        :param group_name: Name of the security group to create
+        :param description: Description of the security group
+        :return: The ID of the created security group, or None if creation fails
+        """
+        try:
+            # Create the security group
+            response = ec2_cli.create_security_group(
+                GroupName=group_name,
+                Description=description,
+                VpcId=vpc_id,
+            )
+            sg_id = response["GroupId"]
+            print(f"Created security group: {sg_id}")
+
+            return sg_id
+
+        except ClientError as e:
+            print(f"An error occurred: {e}")
+            return None
+
+    def add_ingress_rules_sg(self, ec2_cli, sg_id, instance_ids):
+        """
+        Add ingress rules to FSx security group for multiple instances
+
+        Args:
+            ec2_cli: boto3 EC2 client
+            sg_id: ID of the FSx security group
+            instance_ids: List of EC2 instance IDs
+        """
+        try:
+            # Get security group IDs for all instances
+            instance_sg_ids = set()
+            for instance_id in instance_ids:
+                response = ec2_cli.describe_instances(InstanceIds=[instance_id])
+                sg_id_instance = response["Reservations"][0]["Instances"][0]["SecurityGroups"][0][
+                    "GroupId"
+                ]
+                instance_sg_ids.add(sg_id_instance)
+
+            instance_group_pairs = [{"GroupId": sg} for sg in instance_sg_ids]
+
+            all_group_pairs = instance_group_pairs + [{"GroupId": sg_id}]
+
+            # Add inbound rules
+            ec2_cli.authorize_security_group_ingress(
+                GroupId=sg_id,
+                IpPermissions=[
+                    {
+                        "IpProtocol": "tcp",
+                        "FromPort": 988,
+                        "ToPort": 1023,
+                        "UserIdGroupPairs": all_group_pairs,
+                    }
+                ],
+            )
+            print(
+                f"Added inbound rules to FSx security group {sg_id} for instance security groups: {instance_sg_ids}"
+            )
+
+        except Exception as e:
+            print(f"Error adding ingress rules: {str(e)}")
+            raise
+
+    def delete_security_group(self, ec2_cli, group_id: str):
+        """
+        Delete the specified security group
+        : param ec2_cli: boto3 EC2 client
+        : param group_id: ID of the security group to delete
+        : raises: Exception if security group deletion fails
+        """
+        try:
+            response = ec2_cli.delete_security_group(
+                GroupId=group_id,
+            )
+            sg_id = response["GroupId"]
+            print(f"Deleted security group: {sg_id}")
+
+        except Exception as e:
+            LOGGER.error(f"Failed to delete security group: {e}")
+            raise
+
+    def setup_csi_driver(self):
+        """
+        Install and configure the AWS FSx CSI Driver in the Kubernetes cluster
+        : return: None
+        : raises: Exception if driver installation or verification fails
+        """
+        try:
+            LOGGER.info("Installing AWS FSx CSI Driver...")
+            run(
+                "helm repo add aws-fsx-csi-driver https://kubernetes-sigs.github.io/aws-fsx-csi-driver/"
+            )
+            run("helm repo update")
+            run(
+                "helm install aws-fsx-csi-driver aws-fsx-csi-driver/aws-fsx-csi-driver --namespace kube-system"
+            )
+            run(
+                "kubectl wait --for=condition=ready pod -l app=fsx-csi-controller -n kube-system --timeout=300s"
+            )
+
+            self._verify_csi_driver()
+            LOGGER.info("FSx CSI Driver installed successfully")
+        except Exception as e:
+            LOGGER.error(f"Failed to setup FSx CSI driver: {e}")
+            raise
+
+    def _verify_csi_driver(self):
+        """
+        Verify that FSx CSI driver pods are running correctly in the cluster
+        : return: None
+        : raises: Exception if driver pods are not found or not running
+        """
+        result = run("kubectl get pods -n kube-system | grep fsx")
+
+        if "fsx-csi-controller" not in result.stdout or "fsx-csi-node" not in result.stdout:
+            raise Exception("FSx CSI driver pods not found")
+
+        fsx_pods = [
+            line
+            for line in result.stdout.split("\n")
+ if ("fsx-csi-controller" in line or "fsx-csi-node" in line) and "Running" in line + ] + + if not fsx_pods: + raise Exception("No running FSx CSI driver pods found") + + LOGGER.info(f"Found {len(fsx_pods)} running FSx CSI driver pods") + + def setup_kubernetes_resources( + self, storage_class_file: str, pv_file: str, pvc_file: str, replacements: Dict[str, str] + ): + """ + Setup Kubernetes FSx resources using provided yaml files and replacements + : param storage_class_file: path to the storage class yaml file + : param pv_file: path to the persistent volume yaml file + : param pvc_file: path to the persistent volume claim yaml file + : param replacements: dictionary of placeholder replacements + Example: {"": "subnet-xxx", "": "sg-xxx"} + : return: None + : raises: Exception if resource creation fails + """ + try: + for file_path in [storage_class_file, pv_file, pvc_file]: + for key, value in replacements.items(): + run(f"sed -i 's|{key}|{value}|g' {file_path}") + + for file_path in [storage_class_file, pv_file, pvc_file]: + run(f"kubectl apply -f {file_path}") + + self.validate_kubernetes_resources() + + except Exception as e: + LOGGER.error(f"Failed to setup Kubernetes FSx resources: {e}") + raise + + def validate_kubernetes_resources(self): + """ + Validate that FSx Kubernetes resources are properly created and bound + : return: True if all resources are validated successfully + : raises: Exception if any resource validation fails + """ + try: + sc_result = run("kubectl get sc fsx-sc") + if "fsx-sc" not in sc_result.stdout or "fsx.csi.aws.com" not in sc_result.stdout: + raise Exception("FSx storage class not created correctly") + + pv_result = run("kubectl get pv fsx-lustre-pv") + if "fsx-lustre-pv" not in pv_result.stdout or "Bound" not in pv_result.stdout: + raise Exception("FSx persistent volume not created correctly") + + pvc_result = run("kubectl get pvc fsx-lustre-pvc") + if "fsx-lustre-pvc" not in pvc_result.stdout or "Bound" not in pvc_result.stdout: + raise Exception("FSx persistent volume claim not created correctly") + + LOGGER.info("FSx Kubernetes resources validated successfully") + return True + + except Exception as e: + LOGGER.error(f"FSx resource validation failed: {e}") + raise diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py new file mode 100644 index 000000000000..5138ce4b20b9 --- /dev/null +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -0,0 +1,645 @@ +import os +import time +import uuid + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError, WaiterError +from fabric import Connection + +# TODO: decide whether we want to copy test_utils to the new path +from test import test_utils +from test.test_utils import ( + AL2023_BASE_DLAMI_ARM64_US_WEST_2, + DEFAULT_REGION, +) +from infra.test_infra.ec2.utils import ( + get_default_vpc_id, + get_subnet_id_by_vpc, + get_ec2_client, + get_availability_zone_ids, + launch_efa_instances_with_retry, + check_instance_state, + check_system_state, + create_name_tags_for_instance, + get_num_efa_interfaces_for_instance_type, + get_network_interface_id, + attach_elastic_ip, + delete_elastic_ips, +) +from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup +from infra.test_infra.test_infra_utils import create_logger + +LOGGER = create_logger(__name__) + +# Constant to represent default region for boto3 commands +DEFAULT_REGION = "us-west-2" +EC2_INSTANCE_ROLE_NAME = "ec2TestInstanceRole" +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", 
"false").lower() == "true" + +# V2 test path constants +V2_LOCAL_TEST_PATH = "test/v2" +V2_INSTANCE_PATH = "$HOME/test_v2" +V2_CONTAINER_PATH = "/test_v2" + +TEST_ID = str(uuid.uuid4()) + + +def ec2_instance_ami(region, image): + if "arm64" in image: + return AL2023_BASE_DLAMI_ARM64_US_WEST_2 + + return test_utils.get_dlami_id(region) + + +def ec2_instance_type(image): + if "arm64" in image: + return "g5g.16xlarge" + else: + return "p4d.24xlarge" + + +def availability_zone_options(ec2_client, ec2_instance_type, region): + """ + Parametrize with a reduced list of availability zones for particular instance types for which + capacity has been reserved in that AZ. For other instance types, parametrize with list of all + AZs in the region. + :param ec2_client: boto3 Client for EC2 + :param ec2_instance_type: str instance type for which AZs must be determined + :param region: str region in which instance must be created + :return: list of str AZ names + """ + allowed_availability_zones = None + if ec2_instance_type in ["p4de.24xlarge"]: + if region == "us-east-1": + allowed_availability_zones = ["us-east-1d", "us-east-1c"] + if ec2_instance_type in ["p4d.24xlarge"]: + if region == "us-west-2": + allowed_availability_zones = ["us-west-2b", "us-west-2c"] + if not allowed_availability_zones: + allowed_availability_zones = get_availability_zone_ids(ec2_client) + return allowed_availability_zones + + +def check_ip_rule_exists(security_group_rules, ip_address): + """ + Check if an IP rule exists in security group rules + """ + if not security_group_rules: + return False + + for rule in security_group_rules: + if ( + rule.get("FromPort") == 80 + and rule.get("ToPort") == 80 + and rule.get("IpProtocol") == "tcp" + and "IpRanges" in rule + ): + for ip_range in rule.get("IpRanges", []): + if ip_range.get("CidrIp") == f"{ip_address}/32": + LOGGER.info(f"Found existing rule for IP {ip_address}") + return True + return False + + +def authorize_ingress(ec2_client, group_id, ip_address): + try: + response = ec2_client.describe_security_groups(GroupIds=[group_id]) + if response.get("SecurityGroups") and response["SecurityGroups"]: + existing_rules = response["SecurityGroups"][0].get("IpPermissions", []) + if check_ip_rule_exists(existing_rules, ip_address): + LOGGER.info("Ingress rule already exists, skipping creation.") + return + + ec2_client.authorize_security_group_ingress( + GroupId=group_id, + IpPermissions=[ + { + "IpProtocol": "tcp", + "FromPort": 8000, + "ToPort": 8000, + "IpRanges": [ + { + "CidrIp": f"{ip_address}/32", + "Description": "Temporary access for vLLM testing", + } + ], + } + ], + ) + LOGGER.info("Ingress rule added successfully.") + except ClientError as e: + LOGGER.error(f"Failed to authorize ingress: {str(e)}") + raise + + +def setup_test_artifacts(ec2_client, instances, key_filename, region): + """ + Setup test artifacts on EC2 instances + """ + ec2_connections = {} + master_connection = None + worker_connection = None + + for instance in instances: + instance_id = instance["InstanceId"] + try: + instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])[ + "Reservations" + ][0]["Instances"][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + # Test connection + connection.run('echo "Connection test"', hide=True) + ec2_connections[instance_id] = 
connection + + if not master_connection: + master_connection = connection + else: + worker_connection = connection + + print(f"Successfully connected to instance {instance_id}") + + except Exception as e: + print(f"Failed to connect to instance {instance_id}: {str(e)}") + raise + + artifact_folder = f"vllm-{TEST_ID}-folder" + s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder) + + def delete_s3_artifact_copy(): + test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location) + + try: + # Setup master instance + if master_connection: + master_connection.run(f"rm -rf {V2_INSTANCE_PATH}") + master_connection.run( + f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" + ) + print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") + master_connection.run( + f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" + ) + + if worker_connection: + worker_connection.run(f"rm -rf {V2_INSTANCE_PATH}") + worker_connection.run( + f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" + ) + print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") + worker_connection.run( + f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" + ) + + finally: + delete_s3_artifact_copy() + + if worker_connection: + return [master_connection, worker_connection] + return [master_connection] + + +def launch_regular_instances_with_retry( + ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, +): + """ + Launch regular (non-EFA) EC2 instances with retry capability + """ + instances = None + error = None + + for a_zone in availability_zone_options: + ec2_run_instances_definition["Placement"] = {"AvailabilityZone": a_zone} + try: + instances = ec2_client.run_instances(**ec2_run_instances_definition)["Instances"] + if instances: + break + except ClientError as e: + LOGGER.error(f"Failed to launch in {a_zone} due to {e}") + error = e + continue + + if not instances: + raise error or Exception("Failed to launch instances in any availability zone") + + return instances + + +def efa_ec2_instances( + ec2_client, + ec2_instance_type, + ec2_instance_role_name, + ec2_key_name, + ec2_instance_ami, + region, + availability_zone_options, + is_arm64, +): + instances = None + key_filename = None + elastic_ip_allocation_ids = [] + is_efa = not is_arm64 + + try: + ec2_key_name = f"{ec2_key_name}-{TEST_ID}" + print(f"Creating instance: CI-CD {ec2_key_name}") + key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name) + volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda" + + instance_name_prefix = f"CI-CD {ec2_key_name}" + ec2_run_instances_definition = { + "BlockDeviceMappings": [ + { + "DeviceName": volume_name, + "Ebs": { + "DeleteOnTermination": True, + "VolumeSize": 600, + "VolumeType": "gp3", + "Iops": 3000, + "Throughput": 125, + }, + }, + ], + "ImageId": ec2_instance_ami, + "InstanceType": ec2_instance_type, + "IamInstanceProfile": {"Name": ec2_instance_role_name}, + "KeyName": ec2_key_name, + "MaxCount": 2 if is_efa else 1, + "MinCount": 2 if is_efa else 1, + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [{"Key": "Name", "Value": instance_name_prefix}], + } + ], + } + + if is_efa: + instances = launch_efa_instances_with_retry( + 
ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, + ) + else: + instances = launch_regular_instances_with_retry( + ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, + ) + + master_instance_id = instances[0]["InstanceId"] + check_instance_state(master_instance_id, state="running", region=region) + check_system_state( + master_instance_id, system_status="ok", instance_status="ok", region=region + ) + print(f"Master instance {master_instance_id} is ready") + create_name_tags_for_instance(master_instance_id, f"{instance_name_prefix}_master", region) + if is_efa: + for i in range(1, len(instances)): + worker_instance_id = instances[i]["InstanceId"] + create_name_tags_for_instance( + worker_instance_id, f"{instance_name_prefix}_worker_{i}", region + ) + check_instance_state(worker_instance_id, state="running", region=region) + check_system_state( + worker_instance_id, system_status="ok", instance_status="ok", region=region + ) + print(f"Worker instance {worker_instance_id} is ready") + + num_efa_interfaces = get_num_efa_interfaces_for_instance_type( + ec2_instance_type, region=region + ) + + print(num_efa_interfaces) + + if num_efa_interfaces > 1: + for instance in instances: + try: + instance_id = instance["InstanceId"] + + network_interface_id = get_network_interface_id(instance_id, region) + elastic_ip_allocation_id = attach_elastic_ip( + network_interface_id, region, ENABLE_IPV6_TESTING + ) + elastic_ip_allocation_ids.append(elastic_ip_allocation_id) + except Exception as e: + if elastic_ip_allocation_ids: + delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) + raise Exception(f"Error allocating elastic IP: {str(e)}") + + connections = setup_test_artifacts(ec2_client, instances, key_filename, region) + return_val = { + "instances": [ + (instance_info["InstanceId"], key_filename) for instance_info in instances + ], + "elastic_ips": elastic_ip_allocation_ids, + "connections": connections, + } + print("Launched EFA Test instances") + return return_val + + except Exception as e: + print(f"Error in efa_ec2_instances: {str(e)}") + # Clean up elastic IPs + if elastic_ip_allocation_ids: + try: + delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) + except Exception as cleanup_error: + print(f"Error cleaning up elastic IPs: {str(cleanup_error)}") + + # Clean up instances + if instances: + try: + instance_ids = [instance["InstanceId"] for instance in instances] + ec2_client.terminate_instances(InstanceIds=instance_ids) + # Wait for instances to terminate + waiter = ec2_client.get_waiter("instance_terminated") + waiter.wait(InstanceIds=instance_ids) + except Exception as cleanup_error: + print(f"Error terminating instances: {str(cleanup_error)}") + + # Clean up key pair + if key_filename: + try: + if os.path.exists(key_filename): + os.remove(key_filename) + if os.path.exists(f"{key_filename}.pub"): + os.remove(f"{key_filename}.pub") + except Exception as cleanup_error: + print(f"Error cleaning up key files: {str(cleanup_error)}") + + raise + + +def _setup_instance(connection, fsx_dns_name, mount_name): + """ + Setup FSx mount and VLLM environment on an instance synchronously + """ + os.chdir("..") + # Copy script to instance + connection.put("vllm/ec2/utils/setup_fsx_vllm.sh", "/home/ec2-user/setup_fsx_vllm.sh") + + # Make script executable and run it + commands = [ + "chmod +x /home/ec2-user/setup_fsx_vllm.sh", + f"/home/ec2-user/setup_fsx_vllm.sh {fsx_dns_name} {mount_name}", + ] + + # Execute commands 
synchronously + result = connection.run("; ".join(commands)) + return result + + +def cleanup_resources(ec2_cli, resources, fsx): + """Cleanup all resources in reverse order of creation""" + cleanup_errors = [] + + def wait_for_instances(instance_ids): + waiter = ec2_cli.get_waiter("instance_terminated") + try: + waiter.wait(InstanceIds=instance_ids, WaiterConfig={"Delay": 60, "MaxAttempts": 100}) + return True + except WaiterError as e: + print(f"Warning: Instance termination waiter timed out: {str(e)}") + return False + + if resources.get("elastic_ips"): + try: + delete_elastic_ips(resources["elastic_ips"], ec2_cli) + print(f"Deleted elastic IPs: {resources['elastic_ips']}") + except Exception as e: + cleanup_errors.append(f"Failed to cleanup Elastic IPs: {str(e)}") + + if resources.get("instances_info"): + try: + instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] + ec2_cli.terminate_instances(InstanceIds=instance_ids) + print(f"Terminating instances: {instance_ids}") + + if not wait_for_instances(instance_ids): + cleanup_errors.append("Instances did not terminate within expected timeframe") + + for _, key_filename in resources["instances_info"]: + if key_filename: + try: + ec2_cli.delete_key_pair(KeyName=key_filename) + for ext in ["", ".pub"]: + file_path = f"{key_filename}{ext}" + if os.path.exists(file_path): + os.remove(file_path) + except Exception as e: + cleanup_errors.append(f"Failed to delete key file: {str(e)}") + except Exception as e: + cleanup_errors.append(f"Failed to cleanup EC2 resources: {str(e)}") + + if resources.get("fsx_config"): + try: + fsx.delete_fsx_filesystem(resources["fsx_config"]["filesystem_id"]) + print(f"Deleted FSx filesystem: {resources['fsx_config']['filesystem_id']}") + except Exception as e: + cleanup_errors.append(f"Failed to delete FSx filesystem: {str(e)}") + + time.sleep(30) + + if resources.get("sg_fsx"): + max_attempts = 10 + for attempt in range(max_attempts): + try: + ec2_cli.delete_security_group(GroupId=resources["sg_fsx"]) + print(f"Deleted security group: {resources['sg_fsx']}") + break + except Exception as e: + if attempt == max_attempts - 1: + cleanup_errors.append( + f"Failed to delete security group after {max_attempts} attempts: {str(e)}" + ) + else: + print(f"Retry {attempt + 1}/{max_attempts} to delete security group") + time.sleep(30) + + if cleanup_errors: + raise Exception("Cleanup errors occurred:\n" + "\n".join(cleanup_errors)) + + +def launch_ec2_instances(ec2_cli, image): + """Launch EC2 instances with EFA support""" + instance_type = ec2_instance_type(image) + ami_id = ec2_instance_ami(DEFAULT_REGION, image) + az_options = availability_zone_options(ec2_cli, instance_type, DEFAULT_REGION) + is_arm64 = True if "arm64" in image else False + + instances_info = efa_ec2_instances( + ec2_client=ec2_cli, + ec2_instance_type=instance_type, + ec2_instance_role_name=EC2_INSTANCE_ROLE_NAME, + ec2_key_name="vllm-ec2-test", + ec2_instance_ami=ami_id, + region=DEFAULT_REGION, + availability_zone_options=az_options, + is_arm64=is_arm64, + ) + print(f"Launched instances: {instances_info}") + return instances_info + + +def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info): + """ + Configure security groups for FSx and EC2 instances + + Args: + ec2_cli: boto3 EC2 client + fsx: FsxSetup instance + vpc_id: VPC ID where security group will be created + instances_info: List of tuples containing (instance_id, key_filename) + + Returns: + str: FSx security group ID + """ + try: + fsx_name = 
f"fsx-lustre-vllm-ec2-test-sg-{instance_id}-{TEST_ID}" + # Create FSx security group + sg_fsx = fsx.create_fsx_security_group( + ec2_cli, + vpc_id, + fsx_name, + "Security group for FSx Lustre VLLM EC2 Tests", + ) + print(f"Created FSx security group: {sg_fsx}") + + # Get instance IDs from instances_info + instance_ids = [instance_id for instance_id, _ in instances_info] + + # Add security group rules + fsx.add_ingress_rules_sg(ec2_cli, sg_fsx, instance_ids) + + return sg_fsx + + except Exception as e: + print(f"Error configuring security groups: {str(e)}") + raise + + +def setup_instance(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name): + """Setup FSx mount on a single instance""" + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])["Reservations"][0][ + "Instances" + ][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + return _setup_instance(connection, fsx_dns_name, mount_name) + + +def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name): + """Mount FSx on worker instance without running setup script""" + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])["Reservations"][0][ + "Instances" + ][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + commands = [ + "sudo yum install -y lustre-client", + "sudo mkdir -p /fsx", + f"sudo mount -t lustre -o relatime,flock {fsx_dns_name}@tcp:/{mount_name} /fsx", + ] + + for cmd in commands: + connection.run(cmd) + + +def setup(image): + """Main setup function for VLLM on EC2 with FSx""" + print("Testing vllm on ec2........") + fsx = FsxSetup(DEFAULT_REGION) + ec2_cli = get_ec2_client(DEFAULT_REGION) + resources = {"instances_info": None, "fsx_config": None, "sg_fsx": None} + + try: + vpc_id = get_default_vpc_id(ec2_cli) + subnet_ids = get_subnet_id_by_vpc(ec2_cli, vpc_id) + + instance_result = launch_ec2_instances(ec2_cli, image) + resources["instances_info"] = instance_result["instances"] + resources["elastic_ips"] = instance_result["elastic_ips"] + resources["connections"] = instance_result["connections"] + print("Waiting 60 seconds for instances to initialize...") + time.sleep(60) + + instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] + resources["sg_fsx"] = configure_security_groups( + instance_ids[0], ec2_cli, fsx, vpc_id, resources["instances_info"] + ) + + # Create FSx filesystem + resources["fsx_config"] = fsx.create_fsx_filesystem( + subnet_ids[0], + [resources["sg_fsx"]], + 1200, + "SCRATCH_2", + {"Name": f"fsx-lustre-vllm-ec2-test-{instance_ids[0]}-{TEST_ID}"}, + ) + print("Created FSx filesystem") + + master_instance_id, master_key_filename = resources["instances_info"][0] + setup_instance( + master_instance_id, + master_key_filename, + ec2_cli, + resources["fsx_config"]["dns_name"], + resources["fsx_config"]["mount_name"], + ) + print(f"Setup completed for master instance {master_instance_id}") + + if len(resources["instances_info"]) > 1: + worker_instance_id, worker_key_filename = resources["instances_info"][1] + mount_fsx_on_worker( + worker_instance_id, + worker_key_filename, + ec2_cli, + 
resources["fsx_config"]["dns_name"], + resources["fsx_config"]["mount_name"], + ) + print(f"FSx mounted on worker instance {worker_instance_id}") + + return resources + + except Exception as e: + print(f"Error during setup: {str(e)}") + cleanup_resources(ec2_cli, resources, fsx) + raise + + +if __name__ == "__main__": + setup() diff --git a/infra/test_infra/ec2/vllm/setup_fsx_vllm.sh b/infra/test_infra/ec2/vllm/setup_fsx_vllm.sh new file mode 100644 index 000000000000..685b1706c432 --- /dev/null +++ b/infra/test_infra/ec2/vllm/setup_fsx_vllm.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# setup_fsx_vllm.sh +# Script to mount FSx and setup VLLM environment + + +# Get FSx DNS name from argument +FSX_DNS_NAME=$1 +MOUNT_NAME=$2 + +# Function to log messages with hostname +log() { + local HOSTNAME=$(hostname) + echo "[Host ${HOSTNAME}] $1" +} + + +# Function to check if command was successful +check_error() { + if [ $? -ne 0 ]; then + echo "Error: $1" + exit 1 + fi +} + +if [ -z "$FSX_DNS_NAME" ] || [ -z "$MOUNT_NAME" ]; then + echo "Usage: $0 " + exit 1 +fi + +# Install required packages +log "Installing required packages..." +sudo yum install -y nfs-utils git +check_error "Failed to install base packages" + + +# Install the latest Lustre client +log "Installing latest Lustre client..." +sudo yum install -y lustre-client +check_error "Failed to install Lustre client" + + +# Create FSx mount directory +log "Creating FSx mount directory..." +sudo mkdir -p /fsx +check_error "Failed to create /fsx directory" + + +# Modify mount command to include verbose output +sudo mount -t lustre -o relatime,flock ${FSX_DNS_NAME}@tcp:/${MOUNT_NAME} /fsx + +# Create VLLM directory in FSx +log "Creating VLLM directory..." +sudo mkdir -p /fsx/vllm-dlc + +check_error "Failed to create /fsx/vllm-dlc directory" + +# Set proper permissions +log "Setting proper permissions..." +sudo chown -R ec2-user:ec2-user /fsx/vllm-dlc +check_error "Failed to set permissions" + +cd /fsx/vllm-dlc +git clone https://github.com/vllm-project/vllm.git +cd vllm +git checkout tags/v0.10.2 + +# Download ShareGPT dataset +log "Downloading ShareGPT dataset..." +cd /fsx/vllm-dlc && wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +check_error "Failed to download ShareGPT dataset" + +log "Setup completed successfully!" + + \ No newline at end of file From a0c90df5bfd534c062c359291fee25f67496b45d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:32:22 -0700 Subject: [PATCH 04/33] add v2 path for EFA tests --- test/v2/ec2/efa/__init__.py | 1 + test/v2/ec2/efa/testEFA | 103 ++++++++ test/v2/ec2/efa/testEFASanity | 27 +++ test/v2/ec2/efa/test_efa.py | 437 ++++++++++++++++++++++++++++++++++ 4 files changed, 568 insertions(+) create mode 100644 test/v2/ec2/efa/__init__.py create mode 100644 test/v2/ec2/efa/testEFA create mode 100644 test/v2/ec2/efa/testEFASanity create mode 100644 test/v2/ec2/efa/test_efa.py diff --git a/test/v2/ec2/efa/__init__.py b/test/v2/ec2/efa/__init__.py new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/test/v2/ec2/efa/__init__.py @@ -0,0 +1 @@ + diff --git a/test/v2/ec2/efa/testEFA b/test/v2/ec2/efa/testEFA new file mode 100644 index 000000000000..4b676249d816 --- /dev/null +++ b/test/v2/ec2/efa/testEFA @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ +set -ex + +NUM_HOSTS_FILE=$1 +NUM_HOSTS=$2 +IS_IPV6=$3 + +if [[ -z "${CUDA_HOME}" ]]; then + echo "CUDA_HOME variable is empty, please define it in dockerfile" + exit 1 +fi + +TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") +INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type) + +GPU_COUNT=$(nvidia-smi -L | wc -l) +NODES=$(($GPU_COUNT * $NUM_HOSTS)) + + +PRETTY_NAME=$(cat /etc/os-release | grep PRETTY_NAME) +TRAINING_LOG="/test/logs/testEFA.log" + +USE_DEVICE_RDMA_ARG="" + +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html +# g5.24xlarge we use in RC is not RDMA read supported +if [[ ${INSTANCE_TYPE} == p4d.24xlarge || ${INSTANCE_TYPE} == p4de.24xlarge || ${INSTANCE_TYPE} == p5.48xlarge ]]; then + USE_DEVICE_RDMA_ARG="-x FI_EFA_USE_DEVICE_RDMA=1" +fi + +validate_all_reduce_performance_logs(){ + grep "aws-ofi-nccl" ${TRAINING_LOG} || { echo "aws-ofi-nccl is not working, please check if it is installed correctly"; exit 1; } + grep -i "NET/OFI Selected [Pp]rovider is efa" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; } + # EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric" + grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; } + if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then + grep "NCCL_TOPO_FILE set by environment to" ${TRAINING_LOG} + # EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA + grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG} + fi +} + +check_efa_nccl_all_reduce_performance(){ + benchmark=$(cat $TRAINING_LOG | grep '1073741824' | tail -n1 | awk -F " " '{{print $11}}' | sed 's/ //' | sed 's/ 5e-07//') + echo "Benchmark throughput: ${benchmark}" + if [[ -z "${benchmark}" ]]; then + echo "benchmark variable is empty" + exit 1 + fi + + # The standard throughput should be at least 41 for 2 p4d with 4 EFA devices + # However, if the 2 instances are not in the same A-Z in the same region, performance can decrease. + # To account for this we need to modify thresholds dynamically based on where instances are. + # Temporarily setting these to be < 50% of optimal until AWS OFI NCCL team has concrete numbers for this. 
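+    # Bash arithmetic only handles integers, so the comparison below pipes the
+    # measured value and the threshold through awk, which prints 1 when
+    # benchmark >= threshold.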
+ PERFORMANCE_THRESHOLD="3" + + if [[ $(echo "$benchmark $PERFORMANCE_THRESHOLD" | awk '{print ($1 >= $2)}') == 1 ]]; then + echo "***************************** check_efa_nccl_all_reduce_performance passed *****************************" + else + echo "***************************** check_efa_nccl_all_reduce_performance failed *****************************" + exit 1 + fi +} + +check_efa_nccl_all_reduce(){ + echo "Running all_reduce_perf test" + + if [[ ${IS_IPV6} == "True" ]]; then + echo "Running all_reduce_perf test with IPv6: using IPv6 mode with NCCL_SOCKET_FAMILY=AF_INET6" + mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ + -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ + -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_FAMILY=AF_INET6 \ + /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" + else + echo "Running all_reduce_perf test with IPv4: using default IPv4 mode" + # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into + # the container. Not using full-paths of mpirun and other executables because these paths can change across PyTorch + # versions in DLC images. + mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ + -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ + -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" + fi + + RETURN_VAL=${PIPESTATUS[0]} + # In case, if you would like see logs, uncomment below line + # RESULT=$(cat ${TRAINING_LOG}) + + if [ ${RETURN_VAL} -eq 0 ]; then + echo "***************************** check_efa_nccl_all_reduce passed *****************************" + else + echo "***************************** check_efa_nccl_all_reduce failed *****************************" + fi + validate_all_reduce_performance_logs + check_efa_nccl_all_reduce_performance +} + +check_efa_nccl_all_reduce diff --git a/test/v2/ec2/efa/testEFASanity b/test/v2/ec2/efa/testEFASanity new file mode 100644 index 000000000000..1f350628c668 --- /dev/null +++ b/test/v2/ec2/efa/testEFASanity @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
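+#
+# These checks are expected to run inside the DLC container: the EFA installer
+# places its tooling under /opt/amazon/efa (prepended to PATH below) and records
+# the installed components in /opt/amazon/efa_installed_packages.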
+ +set -ex + +export PATH=/opt/amazon/efa/bin:$PATH + +# check if efa components are correctly installed +cat /opt/amazon/efa_installed_packages + +# check Libfabric EFA interfaces +fi_info -p efa +fi_info -p efa -t FI_EP_RDM | grep 'FI_EP_RDM' + +apt-get update && apt-get install -y kmod + +# check if ib_uverbs is present +lsmod | grep ib_uverbs + +# ensure that the security group created is configured correctly +/test/bin/efa/efa_test.sh + +# Queries local RDMA devices +ibv_devinfo + +# check if gdr device is loaded +cat /sys/class/infiniband/**/device/p2p | grep 'NVIDIA' diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py new file mode 100644 index 000000000000..dded29bdffcc --- /dev/null +++ b/test/v2/ec2/efa/test_efa.py @@ -0,0 +1,437 @@ +import os + +import pytest + +import test.test_utils.ec2 as ec2_utils +from test.test_utils import ( + CONTAINER_TESTS_PREFIX_V2, + get_account_id_from_image_uri, + get_region_from_image_uri, + is_pr_context, + is_efa_dedicated, + are_heavy_instance_ec2_tests_enabled, + login_to_ecr_registry, + run_cmd_on_container, +) +from packaging.version import Version +from packaging.specifiers import SpecifierSet + +from infra.test_infra.ec2.utils import ( + get_efa_ec2_instance_type, + filter_efa_instance_type, + filter_efa_only_p4_instance_type, +) + +BUILD_ALL_REDUCE_PERF_CMD = os.path.join( + CONTAINER_TESTS_PREFIX_V2, "efa", "build_all_reduce_perf.sh" +) +EFA_SANITY_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX_V2, "efa", "testEFASanity") +EFA_INTEGRATION_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX_V2, "efa", "testEFA") +EFA_PYTORCH_HEALTHCHECK_TEST_CMD = os.path.join( + CONTAINER_TESTS_PREFIX_V2, "healthcheck_tests", "efa_checker_single_node.sh" +) + +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + +MASTER_SSH_KEY_NAME = "master_id_rsa" +WORKER_SSH_KEY_NAME = "worker_id_rsa" +MASTER_CONTAINER_NAME = "master_container" +WORKER_CONTAINER_NAME = "worker_container" +HOSTS_FILE_LOCATION = "/root/hosts" + +DEFAULT_EFA_TIMEOUT = 300 + + +def get_vllm_container_name(test_scenario, arch_type, node_role=None): + """ + Generate unique container name for vLLM v2 EC2 tests + + Args: + test_scenario: Test scenario (e.g., "efa", "single-node") + arch_type: Architecture from buildspec (e.g., "x86_64", "arm64") + node_role: For multi-node: "master", "worker-0", etc. (optional) + + Returns: + Container name like "vllm-ec2-efa-x86_64-master" or "vllm-ec2-single-node-arm64" + """ + base_name = f"vllm-ec2-{test_scenario}-{arch_type}" + return f"{base_name}-{node_role}" if node_role else base_name + + +EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type( + default="p4d.24xlarge", + filter_function=filter_efa_instance_type, +) + +EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type( + default="p4d.24xlarge", + filter_function=filter_efa_only_p4_instance_type, +) + + +# TODO: decide on whether to keep this commented out or left out until actual implementation of each framework +# def test_pytorch_efa( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only +# ): +# """ +# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA +# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version +# installed in the DLC image. The test also builds nccl-tests to create the all_reduce_perf +# binary necessary for multinode tests, on each node. 
+# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances +# on pipelines. +# :param pytorch_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# number_of_nodes = 2 +# _setup_multinode_efa_instances( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) + +# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" + +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"{EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) + + +# def test_efa_tensorflow( +# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only +# ): +# """ +# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA +# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version +# installed in the DLC image. The test also builds nccl-tests to create the all_reduce_perf +# binary necessary for multinode tests, on each node. +# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances +# on pipelines. +# :param tensorflow_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# number_of_nodes = 2 +# _setup_multinode_efa_instances( +# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) + +# # pass IPv6 flag if enabled +# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" + +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"export CUDA_HOME='/usr/local/cuda'; {EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) + + +# def test_pytorch_efa_healthcheck( +# pytorch_training, +# efa_ec2_instances, +# efa_ec2_connections, +# ec2_instance_type, +# region, +# gpu_only, +# ): +# """ +# Run EFA Health Check tests on DLC. 
+# :param pytorch_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# _setup_multinode_efa_instances( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) + + +def _setup_multinode_efa_instances( + image, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, arch_type=None +): + """ + Pull and start test image containers on both master and worker instances, configure + password-less SSH between master and worker nodes, and build all_reduce_perf binary on + master and worker nodes. + :param image: str DLC image URI to be tested + :param efa_ec2_instances: list of tuples of instance_id, keypair_filepath for each instance + :param efa_ec2_connections: list of fabric connection objects + :param ec2_instance_type: str instance type being used + :param region: str region name in which test is being run + :param arch_type: str architecture type (e.g., "x86_64", "arm64") + """ + # Asynchronously pull docker image on all instances + _pull_image_on_all_instances(efa_ec2_connections, image) + # Configure master node container + master_connection = efa_ec2_connections[0] + + # Determine container names - use unique names for vLLM, standard names for others + if "vllm" in image: + # Use provided arch_type or infer from image as fallback + if arch_type is None: + arch_type = "arm64" if "arm64" in image else "x86_64" + master_container_name = get_vllm_container_name("efa", arch_type, "master") + else: + master_container_name = MASTER_CONTAINER_NAME + + build_all_reduce_perf_promises = [] + # Run container + _setup_container(master_connection, image, master_container_name) + # Build all_reduce_perf binary using nccl-tests + promise = run_cmd_on_container( + master_container_name, + master_connection, + BUILD_ALL_REDUCE_PERF_CMD, + timeout=DEFAULT_EFA_TIMEOUT, + asynchronous=True, + ) + build_all_reduce_perf_promises.append(promise) + + for idx, worker_connection in enumerate(efa_ec2_connections[1:]): + # Determine worker container name + if "vllm" in image: + worker_container_name = get_vllm_container_name("efa", arch_type, f"worker-{idx}") + else: + worker_container_name = WORKER_CONTAINER_NAME + + # Run container + _setup_container(worker_connection, image, worker_container_name) + # Build all_reduce_perf binary using nccl-tests + promise = run_cmd_on_container( + worker_container_name, + worker_connection, + BUILD_ALL_REDUCE_PERF_CMD, + timeout=DEFAULT_EFA_TIMEOUT, + asynchronous=True, + ) + build_all_reduce_perf_promises.append(promise) + + # Configure master node SSH client-side configurations + _setup_master_efa_ssh_config(master_connection) + # Create a hosts file that provides mpi with IP addresses and no. 
of GPUs in each node + worker_instance_ids = [instance_id for instance_id, _ in efa_ec2_instances[1:]] + _create_master_mpi_hosts_file( + efa_ec2_connections, worker_instance_ids, ec2_instance_type, region + ) + # Obtain master node SSH public key for future use + master_pub_key = run_cmd_on_container( + MASTER_CONTAINER_NAME, master_connection, f"cat $HOME/.ssh/{MASTER_SSH_KEY_NAME}.pub" + ).stdout.strip("\n") + + # Configure worker node containers + for worker_connection in efa_ec2_connections[1:]: + # Configure worker node SSH server-side configurations, launch SSH daemon, and allow + # password-less SSH access from master to worker nodes. + _setup_worker_efa_ssh_config(worker_connection, master_pub_key) + + # Wait for all_reduce_perf binaries to be built in all containers + for promise in build_all_reduce_perf_promises: + promise.join() + + +def _pull_image_on_all_instances(connections, docker_image): + """ + Asynchronously pull tested image on all instances + :param connections: list of Fabric Connection objects + :param docker_image: str DLC image URI to be tested + """ + account_id = get_account_id_from_image_uri(docker_image) + region = get_region_from_image_uri(docker_image) + + for conn in connections: + login_to_ecr_registry(conn, account_id, region) + + promises = [conn.run(f"docker pull {docker_image}", asynchronous=True) for conn in connections] + for prom in promises: + prom.join() + + +def _setup_container(connection, docker_image, container_name): + """ + Pull and run tested image with all EFA devices made available to container + :param connection: Fabric Connection object + :param docker_image: str DLC image URI to be tested + :param container_name: str container name + """ + devices = ec2_utils.get_efa_devices_on_instance(connection) + docker_devices_args = [f"--device {device_location}" for device_location in devices] + docker_all_devices_arg = " ".join(docker_devices_args) + + # Remove pre-existing containers if reusing an instance + connection.run(f"docker rm -f {container_name}", hide=True) + + # Use network mode host, rather than the default "bridge" to allow direct access to container + # using SSH on a pre-defined port (as decided by sshd_config on server-side). + # Allow instance to share all memory with container using memlock=-1:-1. + # Share all EFA devices with container using --device for all EFA devices. + if "vllm" in docker_image: + connection.run( + f"docker run --entrypoint=/bin/bash -e CUDA_HOME=/usr/local/cuda --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " + f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image}" + ) + else: + connection.run( + f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " + f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image} bash" + ) + + +def _setup_master_efa_ssh_config(connection): + """ + Set up SSH client config on master container to connect to worker + :param connection: Fabric Connection object + """ + run_cmd_on_container( + MASTER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{MASTER_SSH_KEY_NAME}*" + ) + # When running container in --network=host, the container hostname changes, requiring + # a new SSH key to be generated. 
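+    # The key is created with an empty passphrase (-N "") so the SSH client
+    # config written below can authenticate non-interactively.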
+ run_cmd_on_container( + MASTER_CONTAINER_NAME, + connection, + f"""ssh-keygen -t rsa -f $HOME/.ssh/{MASTER_SSH_KEY_NAME} -N "" """, + ) + # Configure SSH client-side to always use newly created key, and use port 2022, since this is + # the port configured in the worker node SSH daemon. + master_container_ssh_config = ( + "Host *\n" + f" IdentityFile /root/.ssh/{MASTER_SSH_KEY_NAME}\n" + " StrictHostKeyChecking no\n" + " UserKnownHostsFile /dev/null\n" + " Port 2022" + ) + run_cmd_on_container( + MASTER_CONTAINER_NAME, + connection, + f"""echo -e "{master_container_ssh_config}" > $HOME/.ssh/config""", + ) + run_cmd_on_container(MASTER_CONTAINER_NAME, connection, "chmod -R 600 $HOME/.ssh/*") + + +def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region): + """ + Create MPI Hosts file that contains private IP addresses of all hosts used in training job. + :param efa_ec2_connections: List of Fabric Connection objects [master_connection, *worker_connections] + :param worker_instance_ids: list of str worker instance IDs + :param instance_type: str EC2 Instance Type being used + :param region: str region name in which test is run + """ + master_connection = efa_ec2_connections[0] + slots = ec2_utils.get_instance_num_gpus(instance_type=instance_type) + worker_instance_private_ips = [ + ec2_utils.get_private_ip(instance_id, region) for instance_id in worker_instance_ids + ] + + if ENABLE_IPV6_TESTING: + master_ip = master_connection.ipv6_address + if not master_ip: + raise RuntimeError("IPv6 testing enabled but no IPv6 address found for master node") + + worker_ips = [conn.ipv6_address for conn in efa_ec2_connections[1:]] + if not all(worker_ips): + raise RuntimeError("IPv6 testing enabled but not all workers have IPv6 addresses") + + hosts_string = f"compute1 slots={slots} " + etc_string = f"{master_ip} compute1" + compute_counter = 2 + + for worker_ip in worker_ips: + compute_name = f"compute{compute_counter}" + hosts_string += f"\n{compute_name} slots={slots} " + etc_string += f"\n{worker_ip} {compute_name}" + compute_counter += 1 + + run_cmd_on_container( + MASTER_CONTAINER_NAME, master_connection, f"""echo "{etc_string}" > /etc/hosts""" + ) + + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", + ) + else: + # Configure MPI hosts file with IP addresses and slots for worker nodes + hosts_string = f"localhost slots={slots} " + for worker_ip in worker_instance_private_ips: + hosts_string += f"\n{worker_ip} slots={slots} " + + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", + ) + + +def _setup_worker_efa_ssh_config(connection, master_pub_key): + """ + Set up SSH server config on worker container to allow connections from master. + :param connection: Fabric Connection object + :param master_pub_key: str Master node public SSH key to allow password-less SSH access + """ + # Force SSH Daemon to use port 2022, since port 22 is already in use by the host instance + run_cmd_on_container( + WORKER_CONTAINER_NAME, connection, """echo "Port 2022" >> /etc/ssh/sshd_config""" + ) + run_cmd_on_container( + WORKER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{WORKER_SSH_KEY_NAME}*" + ) + # When running container in --network=host, the container hostname changes, requiring + # a new SSH key to be generated. 
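+    # As on the master, the key uses an empty passphrase; its public half is
+    # added to authorized_keys below alongside the master's public key.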
+ run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"""ssh-keygen -t rsa -f $HOME/.ssh/{WORKER_SSH_KEY_NAME} -N "" """, + ) + # Add both self and master public keys to authorized keys to allow password-less access to + # this container from authorized hosts. + run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"cp $HOME/.ssh/{WORKER_SSH_KEY_NAME}.pub $HOME/.ssh/authorized_keys", + ) + run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"""echo "{master_pub_key}" >> $HOME/.ssh/authorized_keys""", + ) + # Check if ssh agent is running or not, and if not, run it. + run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"eval `ssh-agent -s` && ssh-add $HOME/.ssh/{WORKER_SSH_KEY_NAME}", + ) + # Start SSH service which uses configurations from /etc/ssh/sshd_config + run_cmd_on_container(WORKER_CONTAINER_NAME, connection, "service ssh start") + # Check status of SSH service, and fail test-setup if service doesn't run correctly. + ssh_status = run_cmd_on_container( + WORKER_CONTAINER_NAME, connection, "service ssh status", warn=True + ) + if ssh_status.failed: + raise RuntimeError("Failed to setup SSH Daemon on worker node") From 1783103b409f34d593ee11a9690919396bf03767 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:33:53 -0700 Subject: [PATCH 05/33] add v2 vllm EC2 tests --- test/v2/ec2/vllm/__init__.py | 0 test/v2/ec2/vllm/head_node_setup.sh | 37 ++ .../vllm/run_vllm_benchmark_single_node.sh | 52 +++ test/v2/ec2/vllm/run_vllm_on_arm64.sh | 113 +++++ test/v2/ec2/vllm/test_ec2.py | 423 ++++++++++++++++++ test/v2/ec2/vllm/worker_node_setup.sh | 25 ++ 6 files changed, 650 insertions(+) create mode 100644 test/v2/ec2/vllm/__init__.py create mode 100644 test/v2/ec2/vllm/head_node_setup.sh create mode 100644 test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh create mode 100644 test/v2/ec2/vllm/run_vllm_on_arm64.sh create mode 100644 test/v2/ec2/vllm/test_ec2.py create mode 100644 test/v2/ec2/vllm/worker_node_setup.sh diff --git a/test/v2/ec2/vllm/__init__.py b/test/v2/ec2/vllm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/v2/ec2/vllm/head_node_setup.sh b/test/v2/ec2/vllm/head_node_setup.sh new file mode 100644 index 000000000000..1c7bd4751824 --- /dev/null +++ b/test/v2/ec2/vllm/head_node_setup.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Usage: ./head_node_setup.sh +set -e + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +IMAGE_URI=$1 +HF_TOKEN=$2 +HEAD_IP=$3 +CONTAINER_NAME=$4 + +log "Starting head node setup..." 
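+# The head container is launched below in a detached tmux session: "ray start
+# --block" keeps the container in the foreground, so tmux lets this script
+# return while the Ray head keeps running.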
+log "Image URI: $IMAGE_URI" +log "Head IP: $HEAD_IP" + +# Start head node in tmux session and capture container ID +tmux new-session -d -s ray_head "docker run \ + --entrypoint /bin/bash \ + --network host \ + --name $CONTAINER_NAME \ + --shm-size 10.24g \ + --gpus all \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + -e VLLM_HOST_IP=$HEAD_IP \ + -e HF_TOKEN=$HF_TOKEN \ + -e FI_PROVIDER=efa \ + -e FI_EFA_USE_DEVICE_RDMA=1 \ + --device=/dev/infiniband/ \ + --ulimit memlock=-1:-1 \ + -p 8000:8000 \ + $IMAGE_URI -c 'ray start --block --head --port=6379'" + +log "Head node started in container: ${CONTAINER_NAME}" + diff --git a/test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh b/test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh new file mode 100644 index 000000000000..1d6c7c0ef999 --- /dev/null +++ b/test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +DLC_IMAGE=$1 +HF_TOKEN=$2 +MODEL_NAME=$3 + +# Run vLLM using Official Docker image from https://docs.vllm.ai/en/latest/deployment/docker.html +# Here is the https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile +tmux new-session -d -s single_node "docker run --runtime nvidia --gpus all \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + -e "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -e "NCCL_DEBUG=TRACE" \ + -p 8000:8000 \ + --ipc=host \ + $DLC_IMAGE \ + --model $MODEL_NAME \ + --tensor-parallel-size 8" + +sleep 1500 + +source vllm_env/bin/activate + +# Example - Online Benchmark: https://github.com/vllm-project/vllm/tree/main/benchmarks#example---online-benchmark +python3 /fsx/vllm-dlc/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL_NAME \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /fsx/vllm-dlc/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 1000 + +# ============ Serving Benchmark Result ============ +# Successful requests: 1000 +# Benchmark duration (s): 82.67 +# Total input tokens: 215196 +# Total generated tokens: 185671 +# Request throughput (req/s): 12.10 +# Output token throughput (tok/s): 2245.92 +# Total Token throughput (tok/s): 4848.99 +# ---------------Time to First Token---------------- +# Mean TTFT (ms): 25037.89 +# Median TTFT (ms): 22099.12 +# P99 TTFT (ms): 58100.87 +# -----Time per Output Token (excl. 1st token)------ +# Mean TPOT (ms): 98.10 +# Median TPOT (ms): 92.09 +# P99 TPOT (ms): 256.34 +# ---------------Inter-token Latency---------------- +# Mean ITL (ms): 84.56 +# Median ITL (ms): 63.78 +# P99 ITL (ms): 253.97 +# ================================================== diff --git a/test/v2/ec2/vllm/run_vllm_on_arm64.sh b/test/v2/ec2/vllm/run_vllm_on_arm64.sh new file mode 100644 index 000000000000..d59ecc62f4aa --- /dev/null +++ b/test/v2/ec2/vllm/run_vllm_on_arm64.sh @@ -0,0 +1,113 @@ +#!/bin/bash +set -e + +DLC_IMAGE=$1 +HF_TOKEN=$2 + +if [ -z "$DLC_IMAGE" ] || [ -z "$HF_TOKEN" ]; then + echo "Usage: $0 " + exit 1 +fi + +MODEL_NAME="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" +CONTAINER_NAME="vllm-arm64-dlc" +PORT=8000 + +wait_for_api() { + local max_attempts=60 + local attempt=1 + + echo "Waiting for VLLM API to be ready..." + while ! 
curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "prompt": "What is vllm?", + "max_tokens": 30 + }' > /dev/null; do + if [ $attempt -ge $max_attempts ]; then + echo "Error: API failed to start after $max_attempts attempts" + docker logs ${CONTAINER_NAME} + exit 1 + fi + sleep 5 + ((attempt++)) + done + echo "API is ready!" +} + +cleanup() { + echo "Cleaning up containers..." + docker stop ${CONTAINER_NAME} 2>/dev/null || true + docker rm ${CONTAINER_NAME} 2>/dev/null || true +} + +handle_error() { + echo "Error occurred on line $1" + cleanup + exit 1 +} + +trap cleanup EXIT +trap 'handle_error $LINENO' ERR + +echo "####################### RUNNING INFERENCE CHECK ########################################" + +docker run --rm \ + -v /fsx/vllm-dlc/vllm:/vllm \ + --entrypoint /bin/bash \ + -e "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -e "VLLM_WORKER_MULTIPROC_METHOD=spawn" \ + -e "VLLM_USE_V1=0" \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + --gpus=all \ + $DLC_IMAGE \ + -c "python3 /vllm/examples/offline_inference/basic/generate.py \ + --model ${MODEL_NAME} \ + --dtype float16 \ + --tensor-parallel-size 1 \ + --max-model-len 2048" + +echo "####################### Starting VLLM server ##########################################" + +docker run -d \ + -v /fsx/vllm-dlc/vllm:/vllm \ + --name ${CONTAINER_NAME} \ + -p ${PORT}:8000 \ + --entrypoint /bin/bash \ + -e "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -e "VLLM_WORKER_MULTIPROC_METHOD=spawn" \ + -e "VLLM_USE_V1=0" \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + --gpus=all \ + $DLC_IMAGE \ + -c "vllm serve ${MODEL_NAME} \ + --dtype float16 \ + --gpu-memory-utilization 0.7 \ + --max-model-len 6000 \ + --enforce-eager \ + --reasoning-parser deepseek_r1" + +wait_for_api +docker logs "${CONTAINER_NAME}" + +echo "####################### API TESTING ###########################" + +curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "prompt": "What is AWS Deep learning container?", + "max_tokens": 50 + }' + +echo "####################### TESTING TOOL CALLS (OPEN AI API) ###########################" + +python -m venv .venv +source .venv/bin/activate + +pip install "openai>=1.0.0" +python3 /fsx/vllm-dlc/vllm/examples/online_serving/openai_chat_completion_with_reasoning.py +deactivate + +echo "####################### Testing completed successfully ###########################" diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py new file mode 100644 index 000000000000..ba64ee3fcc25 --- /dev/null +++ b/test/v2/ec2/vllm/test_ec2.py @@ -0,0 +1,423 @@ +import threading +import boto3 +import time, json +from botocore.exceptions import ClientError +from fabric import Connection + +from infra.test_infra.ec2.utils import ( + get_account_id_from_image_uri, + login_to_ecr_registry, + get_ec2_client, + install_python_in_instance, +) + +from infra.test_infra.test_infra_utils import create_logger +from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup +from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources, TEST_ID +from test.v2.ec2.efa.test_efa import ( + _setup_multinode_efa_instances, + EFA_SANITY_TEST_CMD, + MASTER_CONTAINER_NAME, + HOSTS_FILE_LOCATION, + EFA_INTEGRATION_TEST_CMD, + DEFAULT_EFA_TIMEOUT, + get_vllm_container_name, +) +from test.test_utils import run_cmd_on_container + +MODEL_NAME = 
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" +DEFAULT_REGION = "us-west-2" +LOGGER = create_logger(__name__) + + +def setup_env(connection): + """Setup Python environment on a node""" + setup_command = """ + python3 -m venv vllm_env && \ + source vllm_env/bin/activate && \ + pip install --upgrade pip setuptools wheel && \ + pip install numpy torch tqdm aiohttp pandas datasets pillow ray vllm==0.10.0 && \ + pip install "transformers<4.54.0" + """ + connection.run(setup_command, shell=True) + + +def create_benchmark_command() -> str: + """Create command for running benchmark""" + return f""" + vllm bench serve \ + --model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ + --backend vllm \ + --base-url "http://localhost:8000" \ + --endpoint '/v1/completions' \ + --dataset-name sharegpt \ + --dataset-path /fsx/vllm-dlc/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 1000 + """ + + +def get_secret_hf_token(): + secret_name = "test/hf_token" + region_name = "us-west-2" + + session = boto3.session.Session() + client = session.client(service_name="secretsmanager", region_name=region_name) + try: + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + except ClientError as e: + raise e + + response = json.loads(get_secret_value_response["SecretString"]) + return response + + +def wait_for_container_ready(connection, container_name, timeout: int = 1000) -> bool: + """ + Wait for container and model to be ready by checking logs and endpoint + Returns True if container and model are ready, False if timeout + """ + start_time = time.time() + model_ready = False + + while time.time() - start_time < timeout: + if not model_ready: + try: + curl_cmd = """ + curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "prompt": "Hello", + "max_tokens": 10 + }' + """ + result = connection.run(curl_cmd, hide=False) + if result.ok: + print("Model endpoint is responding") + print("\n=== Complete vLLM Server Log ===") + connection.run(f"docker exec {container_name} cat vllm.log", hide=False) + print("=== End of Log ===\n") + model_ready = True + return True + except Exception: + pass + return False + + +def setup_docker_image(conn, image_uri): + account_id = get_account_id_from_image_uri(image_uri) + login_to_ecr_registry(conn, account_id, DEFAULT_REGION) + print(f"Pulling image: {image_uri}") + conn.run(f"docker pull {image_uri}", hide="out") + + +def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_uri): + try: + # Get HF token + response = get_secret_hf_token() + hf_token = response.get("HF_TOKEN") + if not hf_token: + raise Exception("Failed to get HF token") + + for conn in [head_connection, worker_connection]: + install_python_in_instance(conn, "3.10") + setup_docker_image(conn, image_uri) + setup_env(conn) + + head_connection.put("v2/ec2/vllm/head_node_setup.sh", "/home/ec2-user/head_node_setup.sh") + worker_connection.put( + "v2/ec2/vllm/worker_node_setup.sh", "/home/ec2-user/worker_node_setup.sh" + ) + + head_connection.run("chmod +x head_node_setup.sh") + worker_connection.run("chmod +x worker_node_setup.sh") + + head_ip = head_connection.run("hostname -i").stdout.strip() + worker_ip = worker_connection.run("hostname -i").stdout.strip() + + container_name = "ray_head-" + TEST_ID + print("Starting head node...") + head_connection.run( + f"./head_node_setup.sh {image_uri} {hf_token} {head_ip} {container_name}" + ) + + 
worker_connection.run(f"./worker_node_setup.sh {image_uri} {head_ip} {worker_ip}") + + # add timer to let container run + time.sleep(30) + + commands = ["ray status", "fi_info -p efa"] + for command in commands: + head_connection.run(f"docker exec -i {container_name} /bin/bash -c '{command}'") + + serve_command = f"vllm serve {MODEL_NAME} --tensor-parallel-size 8 --pipeline-parallel-size 2 --max-num-batched-tokens 16384" + head_connection.run( + f"docker exec -i {container_name} /bin/bash -c '{serve_command} > vllm.log 2>&1 &'" + ) + + print("Waiting for model to be ready, approx estimated time to complete is 15 mins...") + if not wait_for_container_ready(head_connection, container_name, timeout=2000): + raise Exception("Container failed to become ready within timeout period") + + print("Running benchmark...") + benchmark_cmd = "source vllm_env/bin/activate &&" + create_benchmark_command() + benchmark_result = head_connection.run(benchmark_cmd, timeout=7200) + + return benchmark_result + + except Exception as e: + raise Exception(f"Multi-node test execution failed: {str(e)}") + + +def verify_gpu_setup(connection): + """ + Verify GPU setup on the instance before running the test + + Args: + connection: Fabric connection object to EC2 instance + + Returns: + bool: True if GPU setup is valid, False otherwise + """ + try: + # Check nvidia-smi + result = connection.run("nvidia-smi", hide=True) + if result.failed: + print("nvidia-smi check failed") + return False + + # Check CUDA availability + cuda_check = connection.run("nvidia-smi -L", hide=True) + if cuda_check.failed or "GPU" not in cuda_check.stdout: + print("No GPUs found") + return False + + return True + + except Exception as e: + print(f"GPU verification failed: {str(e)}") + return False + + +def cleanup_containers(connection): + """ + Cleanup docker containers and images on the instance + + Args: + connection: Fabric connection object + """ + try: + print("Cleaning up containers and images...") + commands = [ + "docker ps -aq | xargs -r docker stop", + "docker ps -aq | xargs -r docker rm", + ] + for cmd in commands: + connection.run(cmd, hide=True, warn=True) + except Exception as e: + print(f"Cleanup warning: {str(e)}") + + +def run_multi_node_test(head_conn, worker_conn, image_uri): + """ + Run multi-node VLLM benchmark test + + Args: + head_conn: Fabric connection object for head node + worker_conn: Fabric connection object for worker node + image_uri: ECR image URI + """ + + print("\n=== Starting Multi-Node Test ===") + verification_tasks = [(head_conn, "head"), (worker_conn, "worker")] + for conn, node_type in verification_tasks: + if not verify_gpu_setup(conn): + raise Exception(f"GPU setup verification failed for {node_type} node") + + result = test_vllm_benchmark_on_multi_node(head_conn, worker_conn, image_uri) + if result.ok: + print("Multi-node test completed successfully") + return True + return False + + +def run_single_node_test(head_conn, image_uri): + """ + Run single-node VLLM benchmark test + + Args: + head_conn: Fabric connection object for head node + image_uri: ECR image URI + """ + if not verify_gpu_setup(head_conn): + raise Exception(f"GPU setup verification failed for head node") + + try: + install_python_in_instance(head_conn, python_version="3.10") + + response = get_secret_hf_token() + hf_token = response.get("HF_TOKEN") + + setup_docker_image(head_conn, image_uri) + + head_conn.put( + "v2/ec2/vllm/run_vllm_on_arm64.sh", + "/home/ec2-user/run_vllm_on_arm64.sh", + ) + commands = [ + "chmod +x 
/home/ec2-user/run_vllm_on_arm64.sh", + f"/home/ec2-user/run_vllm_on_arm64.sh {image_uri} {hf_token}", + ] + + result = head_conn.run( + "; ".join(commands), + hide=False, + timeout=7200, + ) + + except Exception as e: + print(f"Test execution failed: {str(e)}") + raise + + if result.ok: + print("Single-node test completed successfully") + return True + + +def test_vllm_on_ec2(resources, image_uri, test_config=None): + """ + Test VLLM on EC2 instances: + - For non-arm64: EFA testing, Single node test, and Multi-node test + - For arm64: Single node test only + + Args: + resources: Dictionary containing instance information and FSx config + image_uri: Docker image URI to test + test_config: Dictionary containing test configuration (arch_type, framework, etc.) + """ + # Extract arch_type from test_config, fallback to parsing from image_uri if not provided + if test_config and "arch_type" in test_config: + arch_type = test_config["arch_type"] + else: + arch_type = "arm64" if "arm64" in image_uri else "x86_64" + ec2_cli = None + fsx = None + ec2_connections = {} + test_results = {"efa": None, "single_node": None, "multi_node": None} + + # to test agents + + try: + ec2_cli = get_ec2_client(DEFAULT_REGION) + fsx = FsxSetup(DEFAULT_REGION) + + for instance_id, key_filename in resources["instances_info"]: + try: + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ + "Reservations" + ][0]["Instances"][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + connection.run('echo "Connection test"', hide=True) + ec2_connections[instance_id] = connection + print(f"Successfully connected to instance {instance_id}") + + except Exception as e: + print(f"Failed to connect to instance {instance_id}: {str(e)}") + raise + + is_arm64 = "arm64" in image_uri + instance_ids = list(ec2_connections.keys()) + head_conn = ec2_connections[instance_ids[0]] + + if is_arm64: + print("\n=== Starting ARM64 Single Node Test ===") + test_results["single_node"] = run_single_node_test(head_conn, image_uri) + print( + f"ARM64 Single node test: {'Passed' if test_results['single_node'] else 'Failed'}" + ) + + elif len(ec2_connections) >= 2: + worker_conn = ec2_connections[instance_ids[1]] + + print("\n=== Starting EFA Tests ===") + _setup_multinode_efa_instances( + image_uri, + resources["instances_info"][:2], + [head_conn, worker_conn], + "p4d.24xlarge", + DEFAULT_REGION, + arch_type, + ) + + # Determine the master container name - vLLM uses unique names + master_container_name = get_vllm_container_name("efa", arch_type, "master") + + # Run EFA sanity test + run_cmd_on_container(master_container_name, head_conn, EFA_SANITY_TEST_CMD, hide=False) + + # Run EFA integration test + run_cmd_on_container( + master_container_name, + head_conn, + f"{EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} 2", + hide=False, + timeout=DEFAULT_EFA_TIMEOUT, + ) + + test_results["efa"] = True + + for conn in [head_conn, worker_conn]: + cleanup_containers(conn) + + print("EFA tests completed successfully") + + # Run multi-node test + test_results["multi_node"] = run_multi_node_test(head_conn, worker_conn, image_uri) + + else: + print("\nSkipping multi-node test: insufficient instances") + + print("\n=== Test Summary ===") + for test_name, result in test_results.items(): + if result is not None: + print( + 
f"{test_name.replace('_', ' ').title()} test: {'Passed' if result else 'Failed'}" + ) + else: + print(f"{test_name.replace('_', ' ').title()} test: Not Run") + + if is_arm64: + if not test_results["single_node"]: + raise Exception("Single node test failed for ARM64") + elif not any(result for result in test_results.values() if result is not None): + raise Exception("All tests failed") + + except Exception as e: + print(f"Test execution failed: {str(e)}") + raise + + finally: + if ec2_cli and fsx: + cleanup_timer = threading.Timer( + 1000, lambda: print("Cleanup timed out, some resources might need manual cleanup") + ) + cleanup_timer.start() + + try: + cleanup_resources(ec2_cli, resources, fsx) + cleanup_timer.cancel() + print("Resources cleaned up successfully") + except Exception as e: + print(f"Cleanup failed: {str(e)}") + finally: + cleanup_timer.cancel() diff --git a/test/v2/ec2/vllm/worker_node_setup.sh b/test/v2/ec2/vllm/worker_node_setup.sh new file mode 100644 index 000000000000..c6670e882fa8 --- /dev/null +++ b/test/v2/ec2/vllm/worker_node_setup.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Usage: ./worker_node_setup.sh +set -e + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +IMAGE_URI=$1 +HEAD_IP=$2 +WORKER_IP=$3 + +tmux new-session -d -s ray_worker "bash /fsx/vllm-dlc/vllm/examples/online_serving/run_cluster.sh \ + $IMAGE_URI \ + $HEAD_IP \ + --worker \ + /fsx/.cache/huggingface \ + -e VLLM_HOST_IP=$WORKER_IP \ + -e FI_PROVIDER=efa \ + -e FI_EFA_USE_DEVICE_RDMA=1 \ + --device=/dev/infiniband/ \ + --ulimit memlock=-1:-1" + +log "Worker node setup complete." \ No newline at end of file From e79206b9d34518477d0c2606029b1832bafc114a Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 13:36:32 -0700 Subject: [PATCH 06/33] connect v2 test infra with test --- infra/test_infra/ec2/setup.py | 80 +++++++++++++++++++++++++++------- infra/test_infra/ec2/utils.py | 1 - infra/test_infra/eks/setup.py | 35 ++++++++------- infra/test_infra/entrypoint.py | 33 +++++++++----- test/test_utils/__init__.py | 3 ++ test/v2/ec2/vllm/test_ec2.py | 16 ++++--- 6 files changed, 119 insertions(+), 49 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index ea4a0084a54c..48874c8e6e3b 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -1,5 +1,4 @@ import os -import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path from infra.test_infra.test_infra_utils import create_logger @@ -13,6 +12,8 @@ def __init__(self): self.region = os.getenv("REGION", "us-west-2") self.build_context = os.getenv("BUILD_CONTEXT") self.image_uri = None + self.framework = None + self.arch_type = None self.ctx = Context() def setup(self, params): @@ -21,12 +22,17 @@ def setup(self, params): """ LOGGER.info(f"Setting up EC2 platform with params: {params}") - framework = params.get("framework") + self.framework = params.get("framework") + self.arch_type = params.get("arch_type", "x86_64") self.image_uri = params.get("image_uri") - if framework == "vllm": - # vllm requires vLLM-specific setup (FSx + multi-node) - LOGGER.info(f"Would call vLLM setup for image: {self.image_uri}") + if self.framework == "vllm": + # vLLM requires vLLM-specific setup (FSx + multi-node) + LOGGER.info(f"Setting up vLLM infrastructure for image: {self.image_uri}") + from infra.test_infra.ec2.vllm.setup_ec2 import setup as vllm_setup + + self.resources = vllm_setup(self.image_uri) + LOGGER.info("vLLM setup completed successfully") else: # 
standard EC2 setup for other frameworks LOGGER.info(f"Would call standard EC2 setup for image: {self.image_uri}") @@ -34,19 +40,63 @@ def setup(self, params): def execute_command(self, cmd): """ - Execute a test command with proper environment setup + Execute a test command with proper environment setup. + Raises exception immediately if command fails. + """ + try: + # Set up environment variables for all commands + env = { + "AWS_REGION": self.region, + "BUILD_CONTEXT": self.build_context, + "DLC_IMAGE": self.image_uri, + "ARCH_TYPE": self.arch_type, + "FRAMEWORK": self.framework, + } + + # Check if this is a vLLM test command + if self.framework == "vllm" and "test/v2/ec2/vllm/test_ec2.py" in cmd: + LOGGER.info(f"Executing vLLM test via direct call: {cmd}") + from test.v2.ec2.vllm.test_ec2 import test_vllm_on_ec2 + + # Pass resources and image_uri; test reads config from env vars + test_vllm_on_ec2(self.resources, self.image_uri) + LOGGER.info(f"Command completed successfully: {cmd}") + else: + # Standard shell command execution for other cases + repo_root = get_cloned_folder_path() + + with self.ctx.cd(repo_root): + LOGGER.info(f"Executing command from {repo_root} with EC2 environment: {cmd}") + self.ctx.run(cmd, env=env) + LOGGER.info(f"Command completed successfully: {cmd}") + + except Exception as e: + raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e + + def cleanup(self): + """ + Cleanup EC2 resources """ - env = { - "AWS_REGION": self.region, - "BUILD_CONTEXT": self.build_context, - "DLC_IMAGE": self.image_uri, - } + if not self.resources: + LOGGER.info("No resources to cleanup") + return - repo_root = get_cloned_folder_path() + if self.framework == "vllm": + LOGGER.info("Cleaning up vLLM resources") + try: + from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources + from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup + from infra.test_infra.ec2.utils import get_ec2_client - with self.ctx.cd(repo_root): - LOGGER.info(f"Executing command from {repo_root} with EC2 environment: {cmd}") - self.ctx.run(cmd, env=env) + ec2_client = get_ec2_client(self.region) + fsx = FsxSetup(self.region) + cleanup_resources(ec2_client, self.resources, fsx) + LOGGER.info("vLLM cleanup completed successfully") + except Exception as e: + LOGGER.error(f"Error during vLLM cleanup: {e}") + raise + else: + LOGGER.info("Standard EC2 cleanup not yet implemented") def _standard_ec2_setup(self, params): """ diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index 86e3d0b8a3a8..3159b172db3c 100644 --- a/infra/test_infra/ec2/utils.py +++ b/infra/test_infra/ec2/utils.py @@ -1,7 +1,6 @@ import os import time import re -import logging import sys import uuid import copy diff --git a/infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py index 9e287e77ddde..a69aa07a5921 100644 --- a/infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -1,5 +1,4 @@ import os -import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path from infra.test_infra.test_infra_utils import create_logger @@ -35,18 +34,24 @@ def setup(self, params): def execute_command(self, cmd): """ - Execute a test command with proper environment setup + Execute a test command with proper environment setup. + Raises exception immediately if command fails. 
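+        :param cmd: shell command executed from the repository root with the EKS
+            environment variables (region, cluster, namespace, image URI) applied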
""" - env = { - "AWS_REGION": self.region, - "CLUSTER_NAME": self.cluster_name, - "NAMESPACE": self.namespace, - "BUILD_CONTEXT": self.build_context, - "DLC_IMAGE": self.image_uri, - } - - repo_root = get_cloned_folder_path() - - with self.ctx.cd(repo_root): - LOGGER.info(f"Executing command from {repo_root} with EKS environment: {cmd}") - self.ctx.run(cmd, env=env) + try: + env = { + "AWS_REGION": self.region, + "CLUSTER_NAME": self.cluster_name, + "NAMESPACE": self.namespace, + "BUILD_CONTEXT": self.build_context, + "DLC_IMAGE": self.image_uri, + } + + repo_root = get_cloned_folder_path() + + with self.ctx.cd(repo_root): + LOGGER.info(f"Executing command from {repo_root} with EKS environment: {cmd}") + self.ctx.run(cmd, env=env) + LOGGER.info(f"Command completed successfully: {cmd}") + + except Exception as e: + raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e diff --git a/infra/test_infra/entrypoint.py b/infra/test_infra/entrypoint.py index 7fe859bc087c..9908eeea3511 100644 --- a/infra/test_infra/entrypoint.py +++ b/infra/test_infra/entrypoint.py @@ -1,5 +1,4 @@ import os -import sys from src.config import is_new_test_structure_enabled from test.test_utils import get_dlc_images from codebuild_environment import get_cloned_folder_path @@ -58,16 +57,28 @@ def main(): platform_name = test_config["platform"] LOGGER.info(f"Test config {i+1}: platform={platform_name}") - if test_type == "ec2" and platform_name.startswith("ec2"): - LOGGER.info(f"Executing EC2 test for platform: {platform_name}") - execute_platform_tests(EC2Platform(), test_config, buildspec_data, image_uri) - elif test_type == "eks" and platform_name.startswith("eks"): - LOGGER.info(f"Executing EKS test for platform: {platform_name}") - execute_platform_tests(EKSPlatform(), test_config, buildspec_data, image_uri) - else: - LOGGER.info( - f"Skipping test config {i+1}: test_type={test_type}, platform={platform_name}" - ) + platform = None + try: + if test_type == "ec2" and platform_name.startswith("ec2"): + LOGGER.info(f"Executing EC2 test for platform: {platform_name}") + platform = EC2Platform() + execute_platform_tests(platform, test_config, buildspec_data, image_uri) + elif test_type == "eks" and platform_name.startswith("eks"): + LOGGER.info(f"Executing EKS test for platform: {platform_name}") + platform = EKSPlatform() + execute_platform_tests(platform, test_config, buildspec_data, image_uri) + else: + LOGGER.info( + f"Skipping test config {i+1}: test_type={test_type}, platform={platform_name}" + ) + finally: + # Cleanup resources if platform supports it + if platform is not None and hasattr(platform, "cleanup"): + LOGGER.info(f"Cleaning up platform resources for {platform_name}") + try: + platform.cleanup() + except Exception as e: + LOGGER.error(f"Cleanup failed for {platform_name}: {e}") if __name__ == "__main__": diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 70ddc7d0d564..bc6793da9a1b 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -215,6 +215,9 @@ def get_ami_id_ssm(region_name, parameter_path): # Used for referencing tests scripts from container_tests directory (i.e. 
from ECS cluster) CONTAINER_TESTS_PREFIX = os.path.join(os.sep, "test", "bin") +# Used for referencing test scripts in the new v2 test structure +CONTAINER_TESTS_PREFIX_V2 = os.path.join(os.sep, "test", "v2", "ec2") + # S3 Bucket to use to transfer tests into an EC2 instance TEST_TRANSFER_S3_BUCKET = f"s3://dlinfra-tests-transfer-bucket-{ACCOUNT_ID}" diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index ba64ee3fcc25..d227902e69d6 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -284,7 +284,7 @@ def run_single_node_test(head_conn, image_uri): return True -def test_vllm_on_ec2(resources, image_uri, test_config=None): +def test_vllm_on_ec2(resources, image_uri): """ Test VLLM on EC2 instances: - For non-arm64: EFA testing, Single node test, and Multi-node test @@ -293,13 +293,15 @@ def test_vllm_on_ec2(resources, image_uri, test_config=None): Args: resources: Dictionary containing instance information and FSx config image_uri: Docker image URI to test - test_config: Dictionary containing test configuration (arch_type, framework, etc.) + + Environment Variables: + ARCH_TYPE: Architecture type (x86_64 or arm64) + AWS_REGION: AWS region + FRAMEWORK: Framework being tested (vllm) """ - # Extract arch_type from test_config, fallback to parsing from image_uri if not provided - if test_config and "arch_type" in test_config: - arch_type = test_config["arch_type"] - else: - arch_type = "arm64" if "arm64" in image_uri else "x86_64" + # Read arch_type from environment variable + import os + arch_type = os.getenv("ARCH_TYPE", "x86_64") ec2_cli = None fsx = None ec2_connections = {} From c45457c82925e4c07c5d35fa5c6802eaedc6d71c Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 13:37:15 -0700 Subject: [PATCH 07/33] add framework and arch_type to eks setup as env vars for consistency --- infra/test_infra/eks/setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py index a69aa07a5921..291af9265925 100644 --- a/infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -14,6 +14,8 @@ def __init__(self): self.cluster_name = None self.namespace = None self.image_uri = None + self.framework = None + self.arch_type = None self.ctx = Context() def setup(self, params): @@ -22,14 +24,15 @@ def setup(self, params): """ LOGGER.info(f"Setting up EKS platform with params: {params}") - framework = params.get("framework") + self.framework = params.get("framework") + self.arch_type = params.get("arch_type", "x86_64") cluster_prefix = params.get("cluster") self.cluster_name = f"{cluster_prefix}-{self.build_context}" self.namespace = params.get("namespace") self.image_uri = params.get("image_uri") LOGGER.info( - f"EKS Platform - Framework: {framework}, Cluster: {self.cluster_name}, Namespace: {self.namespace}" + f"EKS Platform - Framework: {self.framework}, Cluster: {self.cluster_name}, Namespace: {self.namespace}" ) def execute_command(self, cmd): @@ -44,6 +47,8 @@ def execute_command(self, cmd): "NAMESPACE": self.namespace, "BUILD_CONTEXT": self.build_context, "DLC_IMAGE": self.image_uri, + "ARCH_TYPE": self.arch_type, + "FRAMEWORK": self.framework, } repo_root = get_cloned_folder_path() From 7cfbd8e3941d8376d9c63faf3f7464cf3a50d22d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 13:38:57 -0700 Subject: [PATCH 08/33] test run with new path --- dlc_developer_config.toml | 2 +- vllm/buildspec.yml | 7 +++++++ 2 files changed, 8 insertions(+), 1 
deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 90c179d30484..42d5c76fba0a 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = false +use_new_test_structure = true ### On by default sanity_tests = true diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index 90dab034e893..a007d441998e 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -52,6 +52,13 @@ images: - ec2 - eks tests: + - platform: ec2-multi-node-efa + params: + instance_type: p4d.24xlarge + node_count: 2 + run: + - python test/v2/ec2/vllm/test_ec2.py + - platform: eks params: cluster: dlc-vllm From 4ecd5b8902e481c40d81d9df1c016143190de4e9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 14:20:04 -0700 Subject: [PATCH 09/33] dummy commit --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 42d5c76fba0a..6bfe70c08d6d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = true +use_new_test_structure = true ### On by default sanity_tests = true From bc16bb657dce9435d49710ee9028ee39161d461e Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 16:51:53 -0700 Subject: [PATCH 10/33] fix import and comment out eks path --- infra/test_infra/ec2/utils.py | 4 +++- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index 3159b172db3c..23dee77758f1 100644 --- a/infra/test_infra/ec2/utils.py +++ b/infra/test_infra/ec2/utils.py @@ -36,8 +36,10 @@ login_to_ecr_registry, get_account_id_from_image_uri, UL_AMI_LIST, + DEFAULT_REGION, + P4DE_REGION, + BENCHMARK_RESULTS_S3_BUCKET, ) -from . 
import DEFAULT_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET from infra.test_infra.test_infra_utils import create_logger EC2_INSTANCE_ROLE_NAME = "ec2TestInstanceRole" diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index a007d441998e..e79fe18ff8f8 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - - platform: eks - params: - cluster: dlc-vllm - namespace: vllm - run: - - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + # - platform: eks + # params: + # cluster: dlc-vllm + # namespace: vllm + # run: + # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From 64bd8782b4f5d8374bfb0341c4f8a1d408a6ea24 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 20:31:39 -0700 Subject: [PATCH 11/33] update vllm test helper --- test/testrunner.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/testrunner.py b/test/testrunner.py index ef687ee85794..e3f0b73b8e0c 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -290,14 +290,6 @@ def run_vllm_tests(test_type, all_image_list, new_test_structure_enabled): try: LOGGER.info(f"Running vLLM {test_type.upper()} tests with image: {all_image_list[0]}") if new_test_structure_enabled: - project_root = os.path.dirname(os.path.dirname(os.getcwd())) - spec = importlib.util.spec_from_file_location( - "entrypoint", - os.path.join(project_root, ".infra", "test_infra", "entrypoint.py"), - ) - entrypoint_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(entrypoint_module) - run_new_tests = entrypoint_module.main LOGGER.info("Using new buildspec-based test system") run_new_tests() else: From a6cc859f4aa650bc492def88dfaaf138ed82f6bd Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 12:58:12 -0700 Subject: [PATCH 12/33] update container naming logic --- test/v2/ec2/efa/test_efa.py | 77 +++++++++++++++++++++--------------- test/v2/ec2/vllm/test_ec2.py | 6 +-- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index dded29bdffcc..1c1abd5b1675 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -42,19 +42,25 @@ DEFAULT_EFA_TIMEOUT = 300 -def get_vllm_container_name(test_scenario, arch_type, node_role=None): +def get_efa_container_name(framework, test_scenario, arch_type, node_role=None): """ - Generate unique container name for vLLM v2 EC2 tests + Generate unique container name for EC2 EFA tests Args: - test_scenario: Test scenario (e.g., "efa", "single-node") + framework: Framework name (e.g., "vllm", "pytorch", "tensorflow") + test_scenario: Test scenario - "efa" arch_type: Architecture from buildspec (e.g., "x86_64", "arm64") - node_role: For multi-node: "master", "worker-0", etc. (optional) + node_role: For multi-node: "master", "worker-0", etc. 
Returns: - Container name like "vllm-ec2-efa-x86_64-master" or "vllm-ec2-single-node-arm64" + Container name like "vllm-ec2-efa-x86_64-master" """ - base_name = f"vllm-ec2-{test_scenario}-{arch_type}" + # Try to get framework from environment variable first + detected_framework = os.environ.get("FRAMEWORK") + if not detected_framework: + detected_framework = framework + + base_name = f"{detected_framework}-ec2-{test_scenario}-{arch_type}" return f"{base_name}-{node_role}" if node_role else base_name @@ -196,7 +202,7 @@ def _setup_multinode_efa_instances( # Use provided arch_type or infer from image as fallback if arch_type is None: arch_type = "arm64" if "arm64" in image else "x86_64" - master_container_name = get_vllm_container_name("efa", arch_type, "master") + master_container_name = get_efa_container_name("vllm", "efa", arch_type, "master") else: master_container_name = MASTER_CONTAINER_NAME @@ -216,7 +222,7 @@ def _setup_multinode_efa_instances( for idx, worker_connection in enumerate(efa_ec2_connections[1:]): # Determine worker container name if "vllm" in image: - worker_container_name = get_vllm_container_name("efa", arch_type, f"worker-{idx}") + worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") else: worker_container_name = WORKER_CONTAINER_NAME @@ -233,22 +239,28 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises.append(promise) # Configure master node SSH client-side configurations - _setup_master_efa_ssh_config(master_connection) + _setup_master_efa_ssh_config(master_connection, master_container_name) # Create a hosts file that provides mpi with IP addresses and no. of GPUs in each node worker_instance_ids = [instance_id for instance_id, _ in efa_ec2_instances[1:]] _create_master_mpi_hosts_file( - efa_ec2_connections, worker_instance_ids, ec2_instance_type, region + efa_ec2_connections, worker_instance_ids, ec2_instance_type, region, master_container_name ) # Obtain master node SSH public key for future use master_pub_key = run_cmd_on_container( - MASTER_CONTAINER_NAME, master_connection, f"cat $HOME/.ssh/{MASTER_SSH_KEY_NAME}.pub" + master_container_name, master_connection, f"cat $HOME/.ssh/{MASTER_SSH_KEY_NAME}.pub" ).stdout.strip("\n") # Configure worker node containers - for worker_connection in efa_ec2_connections[1:]: + for idx, worker_connection in enumerate(efa_ec2_connections[1:]): + # Determine worker container name + if "vllm" in image: + worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") + else: + worker_container_name = WORKER_CONTAINER_NAME + # Configure worker node SSH server-side configurations, launch SSH daemon, and allow # password-less SSH access from master to worker nodes. 
- _setup_worker_efa_ssh_config(worker_connection, master_pub_key) + _setup_worker_efa_ssh_config(worker_connection, master_pub_key, worker_container_name) # Wait for all_reduce_perf binaries to be built in all containers for promise in build_all_reduce_perf_promises: @@ -302,18 +314,19 @@ def _setup_container(connection, docker_image, container_name): ) -def _setup_master_efa_ssh_config(connection): +def _setup_master_efa_ssh_config(connection, master_container_name): """ Set up SSH client config on master container to connect to worker :param connection: Fabric Connection object + :param master_container_name: str master container name """ run_cmd_on_container( - MASTER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{MASTER_SSH_KEY_NAME}*" + master_container_name, connection, f"rm -rf $HOME/.ssh/{MASTER_SSH_KEY_NAME}*" ) # When running container in --network=host, the container hostname changes, requiring # a new SSH key to be generated. run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, connection, f"""ssh-keygen -t rsa -f $HOME/.ssh/{MASTER_SSH_KEY_NAME} -N "" """, ) @@ -327,20 +340,21 @@ def _setup_master_efa_ssh_config(connection): " Port 2022" ) run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, connection, f"""echo -e "{master_container_ssh_config}" > $HOME/.ssh/config""", ) - run_cmd_on_container(MASTER_CONTAINER_NAME, connection, "chmod -R 600 $HOME/.ssh/*") + run_cmd_on_container(master_container_name, connection, "chmod -R 600 $HOME/.ssh/*") -def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region): +def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region, master_container_name): """ Create MPI Hosts file that contains private IP addresses of all hosts used in training job. :param efa_ec2_connections: List of Fabric Connection objects [master_connection, *worker_connections] :param worker_instance_ids: list of str worker instance IDs :param instance_type: str EC2 Instance Type being used :param region: str region name in which test is run + :param master_container_name: str master container name """ master_connection = efa_ec2_connections[0] slots = ec2_utils.get_instance_num_gpus(instance_type=instance_type) @@ -368,11 +382,11 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst compute_counter += 1 run_cmd_on_container( - MASTER_CONTAINER_NAME, master_connection, f"""echo "{etc_string}" > /etc/hosts""" + master_container_name, master_connection, f"""echo "{etc_string}" > /etc/hosts""" ) run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, master_connection, f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) @@ -383,55 +397,56 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst hosts_string += f"\n{worker_ip} slots={slots} " run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, master_connection, f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) -def _setup_worker_efa_ssh_config(connection, master_pub_key): +def _setup_worker_efa_ssh_config(connection, master_pub_key, worker_container_name): """ Set up SSH server config on worker container to allow connections from master. 
:param connection: Fabric Connection object :param master_pub_key: str Master node public SSH key to allow password-less SSH access + :param worker_container_name: str worker container name """ # Force SSH Daemon to use port 2022, since port 22 is already in use by the host instance run_cmd_on_container( - WORKER_CONTAINER_NAME, connection, """echo "Port 2022" >> /etc/ssh/sshd_config""" + worker_container_name, connection, """echo "Port 2022" >> /etc/ssh/sshd_config""" ) run_cmd_on_container( - WORKER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{WORKER_SSH_KEY_NAME}*" + worker_container_name, connection, f"rm -rf $HOME/.ssh/{WORKER_SSH_KEY_NAME}*" ) # When running container in --network=host, the container hostname changes, requiring # a new SSH key to be generated. run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"""ssh-keygen -t rsa -f $HOME/.ssh/{WORKER_SSH_KEY_NAME} -N "" """, ) # Add both self and master public keys to authorized keys to allow password-less access to # this container from authorized hosts. run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"cp $HOME/.ssh/{WORKER_SSH_KEY_NAME}.pub $HOME/.ssh/authorized_keys", ) run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"""echo "{master_pub_key}" >> $HOME/.ssh/authorized_keys""", ) # Check if ssh agent is running or not, and if not, run it. run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"eval `ssh-agent -s` && ssh-add $HOME/.ssh/{WORKER_SSH_KEY_NAME}", ) # Start SSH service which uses configurations from /etc/ssh/sshd_config - run_cmd_on_container(WORKER_CONTAINER_NAME, connection, "service ssh start") + run_cmd_on_container(worker_container_name, connection, "service ssh start") # Check status of SSH service, and fail test-setup if service doesn't run correctly. 
ssh_status = run_cmd_on_container( - WORKER_CONTAINER_NAME, connection, "service ssh status", warn=True + worker_container_name, connection, "service ssh status", warn=True ) if ssh_status.failed: raise RuntimeError("Failed to setup SSH Daemon on worker node") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index d227902e69d6..2f5b1895e854 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -21,7 +21,7 @@ HOSTS_FILE_LOCATION, EFA_INTEGRATION_TEST_CMD, DEFAULT_EFA_TIMEOUT, - get_vllm_container_name, + get_efa_container_name, ) from test.test_utils import run_cmd_on_container @@ -361,8 +361,8 @@ def test_vllm_on_ec2(resources, image_uri): arch_type, ) - # Determine the master container name - vLLM uses unique names - master_container_name = get_vllm_container_name("efa", arch_type, "master") + # Determine the master container name + master_container_name = get_efa_container_name("vllm", "efa", arch_type, "master") # Run EFA sanity test run_cmd_on_container(master_container_name, head_conn, EFA_SANITY_TEST_CMD, hide=False) From feb3a96f0bf74b30b7687f9cac5cdc0a4f541534 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 14:48:22 -0700 Subject: [PATCH 13/33] move missing efa test files --- test/v2/ec2/efa/build_all_reduce_perf.sh | 20 ++++++++++++++ test/v2/ec2/efa/efa_test.sh | 33 ++++++++++++++++++++++++ test/v2/ec2/efa/testEFASanity | 2 +- 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 test/v2/ec2/efa/build_all_reduce_perf.sh create mode 100644 test/v2/ec2/efa/efa_test.sh diff --git a/test/v2/ec2/efa/build_all_reduce_perf.sh b/test/v2/ec2/efa/build_all_reduce_perf.sh new file mode 100644 index 000000000000..70f0cfecaaed --- /dev/null +++ b/test/v2/ec2/efa/build_all_reduce_perf.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# For dockerfiles of PyTorch >= 2.0, CUDA_HOME is already set as an env, and is configured as /opt/conda +python -c "import torch; from packaging.version import Version; assert Version(torch.__version__) >= Version('2.0')" +TORCH_VERSION_2x=$? +if [ $TORCH_VERSION_2x -ne 0 ]; then + CUDA_HOME=/usr/local/cuda +fi + +set -e + +echo "Building all_reduce_perf from nccl-tests" +cd /tmp/ +rm -rf nccl-tests/ +git clone https://github.com/NVIDIA/nccl-tests.git +cd nccl-tests/ +make MPI=1 MPI_HOME=/opt/amazon/openmpi NCCL_HOME=/usr/local CUDA_HOME=${CUDA_HOME} +cp build/all_reduce_perf /all_reduce_perf +cd /tmp/ +rm -rf nccl-tests/ \ No newline at end of file diff --git a/test/v2/ec2/efa/efa_test.sh b/test/v2/ec2/efa/efa_test.sh new file mode 100644 index 000000000000..2a960e6023d7 --- /dev/null +++ b/test/v2/ec2/efa/efa_test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +set -ex + +# Script to run fi_pingpong locally over EFA to test connectivity. + +if ! command -v fi_info >/dev/null 2>&1 || ! command -v fi_pingpong >/dev/null 2>&1; then + echo "Error: required libfabric binaries not found." + exit 1 +fi + +if ! fi_info -p efa >/dev/null 2>&1; then + echo "Error: EFA libfabric provider not detected." >&2 + exit 1 +fi + +echo "Starting server..." +FI_EFA_ENABLE_SHM_TRANSFER=0 fi_pingpong -e rdm -p efa >/dev/null 2>&1 & +sleep 0.5 + +echo "Starting client..." +FI_EFA_ENABLE_SHM_TRANSFER=0 timeout 8 fi_pingpong -e rdm -p efa localhost +ret=$? +if [ $ret -ne 0 ]; then + if [ $ret -eq 124 ]; then + echo "Error: fi_pingpong test timed out." >&2 + else + echo "Error: fi_pingpong test returned $ret." 
>&2 + fi +fi +kill %1 +exit $ret diff --git a/test/v2/ec2/efa/testEFASanity b/test/v2/ec2/efa/testEFASanity index 1f350628c668..051f77351a65 100644 --- a/test/v2/ec2/efa/testEFASanity +++ b/test/v2/ec2/efa/testEFASanity @@ -18,7 +18,7 @@ apt-get update && apt-get install -y kmod lsmod | grep ib_uverbs # ensure that the security group created is configured correctly -/test/bin/efa/efa_test.sh +/test/v2/ec2/efa/efa_test.sh # Queries local RDMA devices ibv_devinfo From 35f0d3e974c4ba803c87f52ccbc44a6d97fd08e9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 16:42:29 -0700 Subject: [PATCH 14/33] make container path and instance path consistent --- infra/test_infra/ec2/vllm/setup_ec2.py | 4 ++-- test/v2/ec2/efa/test_efa.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 5138ce4b20b9..0528d0ab9d05 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -39,8 +39,8 @@ # V2 test path constants V2_LOCAL_TEST_PATH = "test/v2" -V2_INSTANCE_PATH = "$HOME/test_v2" -V2_CONTAINER_PATH = "/test_v2" +V2_INSTANCE_PATH = "$HOME/test/v2" +V2_CONTAINER_PATH = "/test/v2" TEST_ID = str(uuid.uuid4()) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 1c1abd5b1675..272195b6e355 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -305,12 +305,12 @@ def _setup_container(connection, docker_image, container_name): if "vllm" in docker_image: connection.run( f"docker run --entrypoint=/bin/bash -e CUDA_HOME=/usr/local/cuda --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " - f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image}" + f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image}" ) else: connection.run( f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " - f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image} bash" + f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) From ae024f1284a538b09874a0fa3bb5b73434bbcbc3 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 18:02:47 -0700 Subject: [PATCH 15/33] move cleanup logic to infra dir --- infra/test_infra/ec2/setup.py | 11 +++++++++++ test/v2/ec2/vllm/test_ec2.py | 21 ++------------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 48874c8e6e3b..f7f6d00aea71 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -1,4 +1,5 @@ import os +import threading from invoke.context import Context from codebuild_environment import get_cloned_folder_path from infra.test_infra.test_infra_utils import create_logger @@ -83,6 +84,13 @@ def cleanup(self): if self.framework == "vllm": LOGGER.info("Cleaning up vLLM resources") + + cleanup_timer = threading.Timer( + 1000, + lambda: LOGGER.warning("Cleanup timed out, some resources might need manual cleanup") + ) + cleanup_timer.start() + try: from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup @@ -91,10 +99,13 @@ def cleanup(self): ec2_client = get_ec2_client(self.region) fsx = FsxSetup(self.region) cleanup_resources(ec2_client, self.resources, fsx) + cleanup_timer.cancel() 
LOGGER.info("vLLM cleanup completed successfully") except Exception as e: LOGGER.error(f"Error during vLLM cleanup: {e}") raise + finally: + cleanup_timer.cancel() else: LOGGER.info("Standard EC2 cleanup not yet implemented") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index 2f5b1895e854..aef003491df0 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -1,4 +1,3 @@ -import threading import boto3 import time, json from botocore.exceptions import ClientError @@ -7,13 +6,13 @@ from infra.test_infra.ec2.utils import ( get_account_id_from_image_uri, login_to_ecr_registry, - get_ec2_client, install_python_in_instance, + get_ec2_client, ) from infra.test_infra.test_infra_utils import create_logger from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup -from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources, TEST_ID +from infra.test_infra.ec2.vllm.setup_ec2 import TEST_ID from test.v2.ec2.efa.test_efa import ( _setup_multinode_efa_instances, EFA_SANITY_TEST_CMD, @@ -407,19 +406,3 @@ def test_vllm_on_ec2(resources, image_uri): except Exception as e: print(f"Test execution failed: {str(e)}") raise - - finally: - if ec2_cli and fsx: - cleanup_timer = threading.Timer( - 1000, lambda: print("Cleanup timed out, some resources might need manual cleanup") - ) - cleanup_timer.start() - - try: - cleanup_resources(ec2_cli, resources, fsx) - cleanup_timer.cancel() - print("Resources cleaned up successfully") - except Exception as e: - print(f"Cleanup failed: {str(e)}") - finally: - cleanup_timer.cancel() From bffcebdf8f2ac7105a36777d6c46e564c885d711 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Fri, 17 Oct 2025 14:16:41 -0700 Subject: [PATCH 16/33] add debugging, fix resource passing & formatting --- infra/test_infra/ec2/setup.py | 13 ++++--- infra/test_infra/ec2/utils.py | 1 + infra/test_infra/eks/setup.py | 2 +- test/v2/ec2/efa/test_efa.py | 26 ++++++++++--- test/v2/ec2/vllm/test_ec2.py | 73 +++++++++++++++++++++++++---------- 5 files changed, 83 insertions(+), 32 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index f7f6d00aea71..102563cd2519 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -55,6 +55,7 @@ def execute_command(self, cmd): } # Check if this is a vLLM test command + # TODO: check if there is a better way to handle this if self.framework == "vllm" and "test/v2/ec2/vllm/test_ec2.py" in cmd: LOGGER.info(f"Executing vLLM test via direct call: {cmd}") from test.v2.ec2.vllm.test_ec2 import test_vllm_on_ec2 @@ -70,7 +71,7 @@ def execute_command(self, cmd): LOGGER.info(f"Executing command from {repo_root} with EC2 environment: {cmd}") self.ctx.run(cmd, env=env) LOGGER.info(f"Command completed successfully: {cmd}") - + except Exception as e: raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e @@ -84,13 +85,15 @@ def cleanup(self): if self.framework == "vllm": LOGGER.info("Cleaning up vLLM resources") - + cleanup_timer = threading.Timer( - 1000, - lambda: LOGGER.warning("Cleanup timed out, some resources might need manual cleanup") + 1000, + lambda: LOGGER.warning( + "Cleanup timed out, some resources might need manual cleanup" + ), ) cleanup_timer.start() - + try: from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index 23dee77758f1..ec3204d46ec0 100644 --- a/infra/test_infra/ec2/utils.py 
+++ b/infra/test_infra/ec2/utils.py @@ -57,6 +57,7 @@ LOGGER = create_logger(__name__) + def filter_only_multi_gpu(instance_type_list): filtered_list = [ instance_type diff --git a/infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py index 291af9265925..bfffe5df3e07 100644 --- a/infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -57,6 +57,6 @@ def execute_command(self, cmd): LOGGER.info(f"Executing command from {repo_root} with EKS environment: {cmd}") self.ctx.run(cmd, env=env) LOGGER.info(f"Command completed successfully: {cmd}") - + except Exception as e: raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 272195b6e355..251d2ff88337 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -59,7 +59,7 @@ def get_efa_container_name(framework, test_scenario, arch_type, node_role=None): detected_framework = os.environ.get("FRAMEWORK") if not detected_framework: detected_framework = framework - + base_name = f"{detected_framework}-ec2-{test_scenario}-{arch_type}" return f"{base_name}-{node_role}" if node_role else base_name @@ -209,6 +209,16 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) + + # Verify files are visible inside container + print(f"Verifying files inside {master_container_name} container...") + run_cmd_on_container( + master_container_name, + master_connection, + "ls -la /test/v2/ec2/efa/", + hide=False, + ) + # Build all_reduce_perf binary using nccl-tests promise = run_cmd_on_container( master_container_name, @@ -222,7 +232,9 @@ def _setup_multinode_efa_instances( for idx, worker_connection in enumerate(efa_ec2_connections[1:]): # Determine worker container name if "vllm" in image: - worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") + worker_container_name = get_efa_container_name( + "vllm", "efa", arch_type, f"worker-{idx}" + ) else: worker_container_name = WORKER_CONTAINER_NAME @@ -254,10 +266,12 @@ def _setup_multinode_efa_instances( for idx, worker_connection in enumerate(efa_ec2_connections[1:]): # Determine worker container name if "vllm" in image: - worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") + worker_container_name = get_efa_container_name( + "vllm", "efa", arch_type, f"worker-{idx}" + ) else: worker_container_name = WORKER_CONTAINER_NAME - + # Configure worker node SSH server-side configurations, launch SSH daemon, and allow # password-less SSH access from master to worker nodes. _setup_worker_efa_ssh_config(worker_connection, master_pub_key, worker_container_name) @@ -347,7 +361,9 @@ def _setup_master_efa_ssh_config(connection, master_container_name): run_cmd_on_container(master_container_name, connection, "chmod -R 600 $HOME/.ssh/*") -def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region, master_container_name): +def _create_master_mpi_hosts_file( + efa_ec2_connections, worker_instance_ids, instance_type, region, master_container_name +): """ Create MPI Hosts file that contains private IP addresses of all hosts used in training job. 
:param efa_ec2_connections: List of Fabric Connection objects [master_connection, *worker_connections] diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index aef003491df0..9aa13761f9e3 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -292,7 +292,7 @@ def test_vllm_on_ec2(resources, image_uri): Args: resources: Dictionary containing instance information and FSx config image_uri: Docker image URI to test - + Environment Variables: ARCH_TYPE: Architecture type (x86_64 or arm64) AWS_REGION: AWS region @@ -300,6 +300,7 @@ def test_vllm_on_ec2(resources, image_uri): """ # Read arch_type from environment variable import os + arch_type = os.getenv("ARCH_TYPE", "x86_64") ec2_cli = None fsx = None @@ -312,28 +313,47 @@ def test_vllm_on_ec2(resources, image_uri): ec2_cli = get_ec2_client(DEFAULT_REGION) fsx = FsxSetup(DEFAULT_REGION) - for instance_id, key_filename in resources["instances_info"]: - try: - instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ - "Reservations" - ][0]["Instances"][0] - public_ip = instance_details.get("PublicIpAddress") - - if not public_ip: - raise Exception(f"No public IP found for instance {instance_id}") - - connection = Connection( - host=public_ip, - user="ec2-user", - connect_kwargs={"key_filename": key_filename}, + # Use existing connections from resources if available, otherwise create new ones + if "connections" in resources and resources["connections"]: + print("Using existing connections from setup phase") + # Use connections that were created during setup_test_artifacts() + ec2_connections = { + instance_id: conn + for (instance_id, _), conn in zip( + resources["instances_info"], resources["connections"] ) - - connection.run('echo "Connection test"', hide=True) - ec2_connections[instance_id] = connection - print(f"Successfully connected to instance {instance_id}") - + } + else: + print("Creating new connections to instances") + for instance_id, key_filename in resources["instances_info"]: + try: + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ + "Reservations" + ][0]["Instances"][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + ec2_connections[instance_id] = connection + + except Exception as e: + print(f"Failed to connect to instance {instance_id}: {str(e)}") + raise + + # Verify all connections are working + for instance_id, conn in ec2_connections.items(): + try: + conn.run('echo "Connection test"', hide=True) + print(f"Successfully verified connection to instance {instance_id}") except Exception as e: - print(f"Failed to connect to instance {instance_id}: {str(e)}") + print(f"Connection test failed for instance {instance_id}: {str(e)}") raise is_arm64 = "arm64" in image_uri @@ -350,6 +370,17 @@ def test_vllm_on_ec2(resources, image_uri): elif len(ec2_connections) >= 2: worker_conn = ec2_connections[instance_ids[1]] + # Verify test files exist before starting containers + print("\n=== Verifying test files on EC2 instances ===") + for conn_id, conn in ec2_connections.items(): + result = conn.run("ls -la $HOME/test/v2/ec2/efa/", warn=True) + if result.failed: + raise Exception( + f"Test files not found at $HOME/test/v2/ec2/efa/ on instance {conn_id}" + ) + print(f"Instance {conn_id} test files:") + print(result.stdout) + print("\n=== 
Starting EFA Tests ===") _setup_multinode_efa_instances( image_uri, From 1e04c399d6ba6eb0fdf00053a8b21a37d9cf8f86 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 19 Oct 2025 14:33:24 -0700 Subject: [PATCH 17/33] store connection params instead of objects --- infra/test_infra/ec2/vllm/setup_ec2.py | 18 +++++++++++++----- test/v2/ec2/vllm/test_ec2.py | 26 ++++++++++++++++---------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 0528d0ab9d05..c4958079ea9d 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -140,6 +140,7 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): ec2_connections = {} master_connection = None worker_connection = None + connection_params = [] for instance in instances: instance_id = instance["InstanceId"] @@ -162,6 +163,14 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): connection.run('echo "Connection test"', hide=True) ec2_connections[instance_id] = connection + # Store connection parameters for later recreation + connection_params.append({ + "instance_id": instance_id, + "host": public_ip, + "user": "ec2-user", + "key_filename": key_filename + }) + if not master_connection: master_connection = connection else: @@ -204,9 +213,8 @@ def delete_s3_artifact_copy(): finally: delete_s3_artifact_copy() - if worker_connection: - return [master_connection, worker_connection] - return [master_connection] + # Return connection parameters + return connection_params def launch_regular_instances_with_retry( @@ -342,13 +350,13 @@ def efa_ec2_instances( delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) raise Exception(f"Error allocating elastic IP: {str(e)}") - connections = setup_test_artifacts(ec2_client, instances, key_filename, region) + connection_params = setup_test_artifacts(ec2_client, instances, key_filename, region) return_val = { "instances": [ (instance_info["InstanceId"], key_filename) for instance_info in instances ], "elastic_ips": elastic_ip_allocation_ids, - "connections": connections, + "connection_params": connection_params, } print("Launched EFA Test instances") return return_val diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index 9aa13761f9e3..c66376cdcb06 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -313,16 +313,22 @@ def test_vllm_on_ec2(resources, image_uri): ec2_cli = get_ec2_client(DEFAULT_REGION) fsx = FsxSetup(DEFAULT_REGION) - # Use existing connections from resources if available, otherwise create new ones - if "connections" in resources and resources["connections"]: - print("Using existing connections from setup phase") - # Use connections that were created during setup_test_artifacts() - ec2_connections = { - instance_id: conn - for (instance_id, _), conn in zip( - resources["instances_info"], resources["connections"] - ) - } + # Recreate connections from stored parameters if available, otherwise create new ones + if "connection_params" in resources and resources["connection_params"]: + print("Recreating connections from stored parameters") + # Recreate fresh Connection objects from parameters stored during setup_test_artifacts() + for params in resources["connection_params"]: + try: + connection = Connection( + host=params["host"], + user=params["user"], + connect_kwargs={"key_filename": params["key_filename"]}, + ) + ec2_connections[params["instance_id"]] = connection + 
print(f"Recreated connection to instance {params['instance_id']}") + except Exception as e: + print(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") + raise else: print("Creating new connections to instances") for instance_id, key_filename in resources["instances_info"]: From 523c335e982b320fa5dd57ae267586a328c8b671 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 19 Oct 2025 19:30:15 -0700 Subject: [PATCH 18/33] fix connections reference --- infra/test_infra/ec2/vllm/setup_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index c4958079ea9d..88f2ea2bf907 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -601,7 +601,7 @@ def setup(image): instance_result = launch_ec2_instances(ec2_cli, image) resources["instances_info"] = instance_result["instances"] resources["elastic_ips"] = instance_result["elastic_ips"] - resources["connections"] = instance_result["connections"] + resources["connection_params"] = instance_result["connection_params"] print("Waiting 60 seconds for instances to initialize...") time.sleep(60) From 4e33e8347f86b796b49bd935d0517b14c7afc691 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 19 Oct 2025 20:37:56 -0700 Subject: [PATCH 19/33] add debug print --- infra/test_infra/ec2/vllm/setup_ec2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 88f2ea2bf907..4bc14849d8a5 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -196,6 +196,13 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") + + # Debug: check dir structure + print("=== DEBUG: Master instance dir structure ===") + print(f"Contents of {V2_INSTANCE_PATH}:") + master_connection.run(f"ls -la {V2_INSTANCE_PATH}/") + print("=== END DEBUG ===\n") + master_connection.run( f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" ) @@ -206,6 +213,13 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") + + # Debug: Check directory structure + print("=== DEBUG: Worker instance directory structure ===") + print(f"Contents of {V2_INSTANCE_PATH}:") + worker_connection.run(f"ls -la {V2_INSTANCE_PATH}/") + print("=== END DEBUG ===\n") + worker_connection.run( f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" ) From e16cedfe8f35786e9a75a06b78a2e4e40cb537d1 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 11:51:40 -0700 Subject: [PATCH 20/33] update v2 test structure s3 upload & download paths --- infra/test_infra/ec2/vllm/setup_ec2.py | 22 +++++++++++----------- test/test_utils/__init__.py | 11 ++++++++--- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 4bc14849d8a5..73aa12ace19f 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -39,7 +39,7 @@ # V2 test path constants 
 V2_LOCAL_TEST_PATH = "test/v2"
-V2_INSTANCE_PATH = "$HOME/test/v2"
+INSTANCE_TEST_BASE_PATH = "$HOME/test"
 V2_CONTAINER_PATH = "/test/v2"
 
 TEST_ID = str(uuid.uuid4())
@@ -191,37 +191,37 @@ def delete_s3_artifact_copy():
     try:
         # Setup master instance
         if master_connection:
-            master_connection.run(f"rm -rf {V2_INSTANCE_PATH}")
+            master_connection.run(f"rm -rf {INSTANCE_TEST_BASE_PATH}")
             master_connection.run(
-                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
+                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
             )
             print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master")
 
             # Debug: check dir structure
             print("=== DEBUG: Master instance dir structure ===")
-            print(f"Contents of {V2_INSTANCE_PATH}:")
-            master_connection.run(f"ls -la {V2_INSTANCE_PATH}/")
+            print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:")
+            master_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/")
             print("=== END DEBUG ===\n")
 
             master_connection.run(
-                f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*"
+                f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*"
             )
 
         if worker_connection:
-            worker_connection.run(f"rm -rf {V2_INSTANCE_PATH}")
+            worker_connection.run(f"rm -rf {INSTANCE_TEST_BASE_PATH}")
             worker_connection.run(
-                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
+                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
             )
             print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker")
 
             # Debug: Check directory structure
             print("=== DEBUG: Worker instance directory structure ===")
-            print(f"Contents of {V2_INSTANCE_PATH}:")
-            worker_connection.run(f"ls -la {V2_INSTANCE_PATH}/")
+            print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:")
+            worker_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/")
             print("=== END DEBUG ===\n")
 
             worker_connection.run(
-                f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*"
+                f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*"
             )
 
     finally:
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index a908467230bc..54461488a75f 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -1309,9 +1309,14 @@ def upload_tests_to_s3(testname_datetime_suffix):
         raise EnvironmentError("Test is being run from wrong path")
     while os.path.basename(path) != "dlc_tests":
         path = os.path.dirname(path)
-    container_tests_path = os.path.join(path, "container_tests")
-
-    run(f"aws s3 cp --recursive {container_tests_path}/ {s3_test_location}/")
+
+    # If the new test structure is enabled, upload only the v2 directory
+    if is_new_test_structure_enabled():
+        v2_path = os.path.join(os.path.dirname(path), "v2")
+        run(f"aws s3 cp --recursive {v2_path}/ {s3_test_location}/v2/")
+    else:
+        container_tests_path = os.path.join(path, "container_tests")
+        run(f"aws s3 cp --recursive {container_tests_path}/ {s3_test_location}/")
 
     return s3_test_location
 
From c903d19a9dc6235189815c9b1eb2a0bebb79fc61 Mon Sep 17 00:00:00 2001
From: Jinyan Li
Date: Mon, 20 Oct 2025 12:47:14 -0700
Subject: [PATCH 21/33] update training log path

---
test/v2/ec2/efa/testEFA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/v2/ec2/efa/testEFA b/test/v2/ec2/efa/testEFA index 4b676249d816..75972864c2cd 100644 --- a/test/v2/ec2/efa/testEFA +++ b/test/v2/ec2/efa/testEFA @@ -20,7 +20,7 @@ NODES=$(($GPU_COUNT * $NUM_HOSTS)) PRETTY_NAME=$(cat /etc/os-release | grep PRETTY_NAME) -TRAINING_LOG="/test/logs/testEFA.log" +TRAINING_LOG="/test/v2/logs/testEFA.log" USE_DEVICE_RDMA_ARG="" From fef320df98b23e110b7622b226a1aa01001b97dc Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 14:27:20 -0700 Subject: [PATCH 22/33] add logging for container names --- test/v2/ec2/efa/test_efa.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 251d2ff88337..cb1a8bd2be1f 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -21,6 +21,9 @@ filter_efa_instance_type, filter_efa_only_p4_instance_type, ) +from infra.test_infra.test_infra_utils import create_logger + +LOGGER = create_logger(__name__) BUILD_ALL_REDUCE_PERF_CMD = os.path.join( CONTAINER_TESTS_PREFIX_V2, "efa", "build_all_reduce_perf.sh" @@ -206,6 +209,8 @@ def _setup_multinode_efa_instances( else: master_container_name = MASTER_CONTAINER_NAME + LOGGER.info(f"Master container name: {master_container_name}") + build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) @@ -238,6 +243,8 @@ def _setup_multinode_efa_instances( else: worker_container_name = WORKER_CONTAINER_NAME + LOGGER.info(f"Worker container name: {worker_container_name}") + # Run container _setup_container(worker_connection, image, worker_container_name) # Build all_reduce_perf binary using nccl-tests @@ -326,6 +333,8 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) + + LOGGER.info(f"Container {container_name} started successfully") def _setup_master_efa_ssh_config(connection, master_container_name): From 3b893c4901414dab1a3dcd2496915e2e2e5bedd9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 14:56:18 -0700 Subject: [PATCH 23/33] clean up debug logs and make prints logs --- infra/test_infra/ec2/vllm/fsx_utils.py | 16 +++--- infra/test_infra/ec2/vllm/setup_ec2.py | 68 ++++++++++------------- test/v2/ec2/efa/test_efa.py | 18 +++---- test/v2/ec2/vllm/test_ec2.py | 75 +++++++++++--------------- 4 files changed, 77 insertions(+), 100 deletions(-) diff --git a/infra/test_infra/ec2/vllm/fsx_utils.py b/infra/test_infra/ec2/vllm/fsx_utils.py index ce27840fff90..a7b1b0dc4446 100644 --- a/infra/test_infra/ec2/vllm/fsx_utils.py +++ b/infra/test_infra/ec2/vllm/fsx_utils.py @@ -72,7 +72,7 @@ def delete_fsx_filesystem(self, fsx_id: str): f" --output text" ).stdout.strip() - print(f"Deleted FSx filesystem: {fsx_id}") + LOGGER.info(f"Deleted FSx filesystem: {fsx_id}") except Exception as e: LOGGER.error(f"Failed to create FSx filesystem: {e}") @@ -85,7 +85,7 @@ def wait_for_filesystem(self, filesystem_id: str): : return: dictionary containing filesystem details (filesystem_id, dns_name, mount_name) : raises: Exception if filesystem enters FAILED, DELETING, or DELETED state """ - print(f"Waiting for FSx filesystem {filesystem_id} to be available...") + LOGGER.info(f"Waiting for FSx filesystem {filesystem_id} to be available...") while 
True: status = run( f"aws fsx describe-file-systems --file-system-id {filesystem_id} " @@ -97,7 +97,7 @@ def wait_for_filesystem(self, filesystem_id: str): elif status in ["FAILED", "DELETING", "DELETED"]: raise Exception(f"FSx filesystem entered {status} state") - print(f"FSx status: {status}, waiting...") + LOGGER.info(f"FSx status: {status}, waiting...") time.sleep(30) # get fs DNS and mount name @@ -130,12 +130,12 @@ def create_fsx_security_group(self, ec2_cli, vpc_id, group_name, description): VpcId=vpc_id, ) sg_id = response["GroupId"] - print(f"Created security group: {sg_id}") + LOGGER.info(f"Created security group: {sg_id}") return sg_id except ClientError as e: - print(f"An error occurred: {e}") + LOGGER.info(f"An error occurred: {e}") return None def add_ingress_rules_sg(self, ec2_cli, sg_id, instance_ids): @@ -173,12 +173,12 @@ def add_ingress_rules_sg(self, ec2_cli, sg_id, instance_ids): } ], ) - print( + LOGGER.info( f"Added inbound rules to FSx security group {sg_id} for instance security groups: {instance_sg_ids}" ) except Exception as e: - print(f"Error adding ingress rules: {str(e)}") + LOGGER.info(f"Error adding ingress rules: {str(e)}") raise def delete_security_group(self, ec2_cli, group_id: str): @@ -195,7 +195,7 @@ def delete_security_group(self, ec2_cli, group_id: str): GroupId=group_id, ) sg_id = response["GroupId"] - print(f"Deleted security group: {sg_id}") + LOGGER.info(f"Deleted security group: {sg_id}") except Exception as e: LOGGER.error(f"Failed to delete security group: {e}") diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 73aa12ace19f..2f08600b4a90 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -176,10 +176,10 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): else: worker_connection = connection - print(f"Successfully connected to instance {instance_id}") + LOGGER.info(f"Successfully connected to instance {instance_id}") except Exception as e: - print(f"Failed to connect to instance {instance_id}: {str(e)}") + LOGGER.error(f"Failed to connect to instance {instance_id}: {str(e)}") raise artifact_folder = f"vllm-{TEST_ID}-folder" @@ -195,13 +195,7 @@ def delete_s3_artifact_copy(): master_connection.run( f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) - print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") - - # Debug: check dir structure - print("=== DEBUG: Master instance dir structure ===") - print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:") - master_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/") - print("=== END DEBUG ===\n") + LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") master_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" @@ -212,13 +206,7 @@ def delete_s3_artifact_copy(): worker_connection.run( f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) - print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") - - # Debug: Check directory structure - print("=== DEBUG: Worker instance directory structure ===") - print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:") - worker_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/") - print("=== END DEBUG ===\n") 
+ LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") worker_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" @@ -277,7 +265,7 @@ def efa_ec2_instances( try: ec2_key_name = f"{ec2_key_name}-{TEST_ID}" - print(f"Creating instance: CI-CD {ec2_key_name}") + LOGGER.info(f"Creating instance: CI-CD {ec2_key_name}") key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name) volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda" @@ -329,7 +317,7 @@ def efa_ec2_instances( check_system_state( master_instance_id, system_status="ok", instance_status="ok", region=region ) - print(f"Master instance {master_instance_id} is ready") + LOGGER.info(f"Master instance {master_instance_id} is ready") create_name_tags_for_instance(master_instance_id, f"{instance_name_prefix}_master", region) if is_efa: for i in range(1, len(instances)): @@ -341,13 +329,13 @@ def efa_ec2_instances( check_system_state( worker_instance_id, system_status="ok", instance_status="ok", region=region ) - print(f"Worker instance {worker_instance_id} is ready") + LOGGER.info(f"Worker instance {worker_instance_id} is ready") num_efa_interfaces = get_num_efa_interfaces_for_instance_type( ec2_instance_type, region=region ) - print(num_efa_interfaces) + LOGGER.info(num_efa_interfaces) if num_efa_interfaces > 1: for instance in instances: @@ -372,17 +360,17 @@ def efa_ec2_instances( "elastic_ips": elastic_ip_allocation_ids, "connection_params": connection_params, } - print("Launched EFA Test instances") + LOGGER.info("Launched EFA Test instances") return return_val except Exception as e: - print(f"Error in efa_ec2_instances: {str(e)}") + LOGGER.error(f"Error in efa_ec2_instances: {str(e)}") # Clean up elastic IPs if elastic_ip_allocation_ids: try: delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) except Exception as cleanup_error: - print(f"Error cleaning up elastic IPs: {str(cleanup_error)}") + LOGGER.error(f"Error cleaning up elastic IPs: {str(cleanup_error)}") # Clean up instances if instances: @@ -393,7 +381,7 @@ def efa_ec2_instances( waiter = ec2_client.get_waiter("instance_terminated") waiter.wait(InstanceIds=instance_ids) except Exception as cleanup_error: - print(f"Error terminating instances: {str(cleanup_error)}") + LOGGER.error(f"Error terminating instances: {str(cleanup_error)}") # Clean up key pair if key_filename: @@ -403,7 +391,7 @@ def efa_ec2_instances( if os.path.exists(f"{key_filename}.pub"): os.remove(f"{key_filename}.pub") except Exception as cleanup_error: - print(f"Error cleaning up key files: {str(cleanup_error)}") + LOGGER.error(f"Error cleaning up key files: {str(cleanup_error)}") raise @@ -437,13 +425,13 @@ def wait_for_instances(instance_ids): waiter.wait(InstanceIds=instance_ids, WaiterConfig={"Delay": 60, "MaxAttempts": 100}) return True except WaiterError as e: - print(f"Warning: Instance termination waiter timed out: {str(e)}") + LOGGER.error(f"Warning: Instance termination waiter timed out: {str(e)}") return False if resources.get("elastic_ips"): try: delete_elastic_ips(resources["elastic_ips"], ec2_cli) - print(f"Deleted elastic IPs: {resources['elastic_ips']}") + LOGGER.error(f"Deleted elastic IPs: {resources['elastic_ips']}") except Exception as e: cleanup_errors.append(f"Failed to cleanup Elastic IPs: {str(e)}") @@ -451,7 +439,7 @@ def wait_for_instances(instance_ids): try: instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] 
ec2_cli.terminate_instances(InstanceIds=instance_ids) - print(f"Terminating instances: {instance_ids}") + LOGGER.info(f"Terminating instances: {instance_ids}") if not wait_for_instances(instance_ids): cleanup_errors.append("Instances did not terminate within expected timeframe") @@ -472,7 +460,7 @@ def wait_for_instances(instance_ids): if resources.get("fsx_config"): try: fsx.delete_fsx_filesystem(resources["fsx_config"]["filesystem_id"]) - print(f"Deleted FSx filesystem: {resources['fsx_config']['filesystem_id']}") + LOGGER.info(f"Deleted FSx filesystem: {resources['fsx_config']['filesystem_id']}") except Exception as e: cleanup_errors.append(f"Failed to delete FSx filesystem: {str(e)}") @@ -483,7 +471,7 @@ def wait_for_instances(instance_ids): for attempt in range(max_attempts): try: ec2_cli.delete_security_group(GroupId=resources["sg_fsx"]) - print(f"Deleted security group: {resources['sg_fsx']}") + LOGGER.info(f"Deleted security group: {resources['sg_fsx']}") break except Exception as e: if attempt == max_attempts - 1: @@ -491,7 +479,7 @@ def wait_for_instances(instance_ids): f"Failed to delete security group after {max_attempts} attempts: {str(e)}" ) else: - print(f"Retry {attempt + 1}/{max_attempts} to delete security group") + LOGGER.info(f"Retry {attempt + 1}/{max_attempts} to delete security group") time.sleep(30) if cleanup_errors: @@ -515,7 +503,7 @@ def launch_ec2_instances(ec2_cli, image): availability_zone_options=az_options, is_arm64=is_arm64, ) - print(f"Launched instances: {instances_info}") + LOGGER.info(f"Launched instances: {instances_info}") return instances_info @@ -541,7 +529,7 @@ def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info) fsx_name, "Security group for FSx Lustre VLLM EC2 Tests", ) - print(f"Created FSx security group: {sg_fsx}") + LOGGER.info(f"Created FSx security group: {sg_fsx}") # Get instance IDs from instances_info instance_ids = [instance_id for instance_id, _ in instances_info] @@ -552,7 +540,7 @@ def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info) return sg_fsx except Exception as e: - print(f"Error configuring security groups: {str(e)}") + LOGGER.error(f"Error configuring security groups: {str(e)}") raise @@ -603,7 +591,7 @@ def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_ def setup(image): """Main setup function for VLLM on EC2 with FSx""" - print("Testing vllm on ec2........") + LOGGER.info("Testing vllm on ec2........") fsx = FsxSetup(DEFAULT_REGION) ec2_cli = get_ec2_client(DEFAULT_REGION) resources = {"instances_info": None, "fsx_config": None, "sg_fsx": None} @@ -616,7 +604,7 @@ def setup(image): resources["instances_info"] = instance_result["instances"] resources["elastic_ips"] = instance_result["elastic_ips"] resources["connection_params"] = instance_result["connection_params"] - print("Waiting 60 seconds for instances to initialize...") + LOGGER.info("Waiting 60 seconds for instances to initialize...") time.sleep(60) instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] @@ -632,7 +620,7 @@ def setup(image): "SCRATCH_2", {"Name": f"fsx-lustre-vllm-ec2-test-{instance_ids[0]}-{TEST_ID}"}, ) - print("Created FSx filesystem") + LOGGER.info("Created FSx filesystem") master_instance_id, master_key_filename = resources["instances_info"][0] setup_instance( @@ -642,7 +630,7 @@ def setup(image): resources["fsx_config"]["dns_name"], resources["fsx_config"]["mount_name"], ) - print(f"Setup completed for master instance 
{master_instance_id}") + LOGGER.info(f"Setup completed for master instance {master_instance_id}") if len(resources["instances_info"]) > 1: worker_instance_id, worker_key_filename = resources["instances_info"][1] @@ -653,12 +641,12 @@ def setup(image): resources["fsx_config"]["dns_name"], resources["fsx_config"]["mount_name"], ) - print(f"FSx mounted on worker instance {worker_instance_id}") + LOGGER.info(f"FSx mounted on worker instance {worker_instance_id}") return resources except Exception as e: - print(f"Error during setup: {str(e)}") + LOGGER.error(f"Error during setup: {str(e)}") cleanup_resources(ec2_cli, resources, fsx) raise diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index cb1a8bd2be1f..8c26e32e6271 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -214,15 +214,15 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) - - # Verify files are visible inside container - print(f"Verifying files inside {master_container_name} container...") - run_cmd_on_container( - master_container_name, - master_connection, - "ls -la /test/v2/ec2/efa/", - hide=False, - ) + + # Uncomment to verify container file structure in case of path issues + # LOGGER.info(f"Verifying files inside {master_container_name} container") + # run_cmd_on_container( + # master_container_name, + # master_connection, + # "ls -la /test/v2/ec2/efa/", + # hide=False, + # ) # Build all_reduce_perf binary using nccl-tests promise = run_cmd_on_container( diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index c66376cdcb06..be6982188cdf 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -92,10 +92,10 @@ def wait_for_container_ready(connection, container_name, timeout: int = 1000) -> """ result = connection.run(curl_cmd, hide=False) if result.ok: - print("Model endpoint is responding") - print("\n=== Complete vLLM Server Log ===") + LOGGER.info("Model endpoint is responding") + LOGGER.info("\n=== Complete vLLM Server Log ===") connection.run(f"docker exec {container_name} cat vllm.log", hide=False) - print("=== End of Log ===\n") + LOGGER.info("=== End of Log ===\n") model_ready = True return True except Exception: @@ -106,7 +106,7 @@ def wait_for_container_ready(connection, container_name, timeout: int = 1000) -> def setup_docker_image(conn, image_uri): account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(conn, account_id, DEFAULT_REGION) - print(f"Pulling image: {image_uri}") + LOGGER.info(f"Pulling image: {image_uri}") conn.run(f"docker pull {image_uri}", hide="out") @@ -135,7 +135,7 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_ worker_ip = worker_connection.run("hostname -i").stdout.strip() container_name = "ray_head-" + TEST_ID - print("Starting head node...") + LOGGER.info("Starting head node...") head_connection.run( f"./head_node_setup.sh {image_uri} {hf_token} {head_ip} {container_name}" ) @@ -154,11 +154,11 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_ f"docker exec -i {container_name} /bin/bash -c '{serve_command} > vllm.log 2>&1 &'" ) - print("Waiting for model to be ready, approx estimated time to complete is 15 mins...") + LOGGER.info("Waiting for model to be ready, approx estimated time to complete is 15 mins...") if not wait_for_container_ready(head_connection, container_name, timeout=2000): raise 
Exception("Container failed to become ready within timeout period") - print("Running benchmark...") + LOGGER.info("Running benchmark...") benchmark_cmd = "source vllm_env/bin/activate &&" + create_benchmark_command() benchmark_result = head_connection.run(benchmark_cmd, timeout=7200) @@ -182,19 +182,19 @@ def verify_gpu_setup(connection): # Check nvidia-smi result = connection.run("nvidia-smi", hide=True) if result.failed: - print("nvidia-smi check failed") + LOGGER.info("nvidia-smi check failed") return False # Check CUDA availability cuda_check = connection.run("nvidia-smi -L", hide=True) if cuda_check.failed or "GPU" not in cuda_check.stdout: - print("No GPUs found") + LOGGER.info("No GPUs found") return False return True except Exception as e: - print(f"GPU verification failed: {str(e)}") + LOGGER.info(f"GPU verification failed: {str(e)}") return False @@ -206,7 +206,7 @@ def cleanup_containers(connection): connection: Fabric connection object """ try: - print("Cleaning up containers and images...") + LOGGER.info("Cleaning up containers and images...") commands = [ "docker ps -aq | xargs -r docker stop", "docker ps -aq | xargs -r docker rm", @@ -214,7 +214,7 @@ def cleanup_containers(connection): for cmd in commands: connection.run(cmd, hide=True, warn=True) except Exception as e: - print(f"Cleanup warning: {str(e)}") + LOGGER.error(f"Cleanup warning: {str(e)}") def run_multi_node_test(head_conn, worker_conn, image_uri): @@ -227,7 +227,7 @@ def run_multi_node_test(head_conn, worker_conn, image_uri): image_uri: ECR image URI """ - print("\n=== Starting Multi-Node Test ===") + LOGGER.info("\n=== Starting Multi-Node Test ===") verification_tasks = [(head_conn, "head"), (worker_conn, "worker")] for conn, node_type in verification_tasks: if not verify_gpu_setup(conn): @@ -235,7 +235,7 @@ def run_multi_node_test(head_conn, worker_conn, image_uri): result = test_vllm_benchmark_on_multi_node(head_conn, worker_conn, image_uri) if result.ok: - print("Multi-node test completed successfully") + LOGGER.info("Multi-node test completed successfully") return True return False @@ -275,11 +275,11 @@ def run_single_node_test(head_conn, image_uri): ) except Exception as e: - print(f"Test execution failed: {str(e)}") + LOGGER.error(f"Test execution failed: {str(e)}") raise if result.ok: - print("Single-node test completed successfully") + LOGGER.info("Single-node test completed successfully") return True @@ -315,7 +315,7 @@ def test_vllm_on_ec2(resources, image_uri): # Recreate connections from stored parameters if available, otherwise create new ones if "connection_params" in resources and resources["connection_params"]: - print("Recreating connections from stored parameters") + LOGGER.info("Recreating connections from stored parameters") # Recreate fresh Connection objects from parameters stored during setup_test_artifacts() for params in resources["connection_params"]: try: @@ -325,12 +325,12 @@ def test_vllm_on_ec2(resources, image_uri): connect_kwargs={"key_filename": params["key_filename"]}, ) ec2_connections[params["instance_id"]] = connection - print(f"Recreated connection to instance {params['instance_id']}") + LOGGER.info(f"Recreated connection to instance {params['instance_id']}") except Exception as e: - print(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") + LOGGER.error(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") raise else: - print("Creating new connections to instances") + LOGGER.info("Creating new connections to 
instances") for instance_id, key_filename in resources["instances_info"]: try: instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ @@ -350,16 +350,16 @@ def test_vllm_on_ec2(resources, image_uri): ec2_connections[instance_id] = connection except Exception as e: - print(f"Failed to connect to instance {instance_id}: {str(e)}") + LOGGER.error(f"Failed to connect to instance {instance_id}: {str(e)}") raise # Verify all connections are working for instance_id, conn in ec2_connections.items(): try: conn.run('echo "Connection test"', hide=True) - print(f"Successfully verified connection to instance {instance_id}") + LOGGER.info(f"Successfully verified connection to instance {instance_id}") except Exception as e: - print(f"Connection test failed for instance {instance_id}: {str(e)}") + LOGGER.error(f"Connection test failed for instance {instance_id}: {str(e)}") raise is_arm64 = "arm64" in image_uri @@ -367,27 +367,16 @@ def test_vllm_on_ec2(resources, image_uri): head_conn = ec2_connections[instance_ids[0]] if is_arm64: - print("\n=== Starting ARM64 Single Node Test ===") + LOGGER.info("\n=== Starting ARM64 Single Node Test ===") test_results["single_node"] = run_single_node_test(head_conn, image_uri) - print( + LOGGER.info( f"ARM64 Single node test: {'Passed' if test_results['single_node'] else 'Failed'}" ) elif len(ec2_connections) >= 2: worker_conn = ec2_connections[instance_ids[1]] - # Verify test files exist before starting containers - print("\n=== Verifying test files on EC2 instances ===") - for conn_id, conn in ec2_connections.items(): - result = conn.run("ls -la $HOME/test/v2/ec2/efa/", warn=True) - if result.failed: - raise Exception( - f"Test files not found at $HOME/test/v2/ec2/efa/ on instance {conn_id}" - ) - print(f"Instance {conn_id} test files:") - print(result.stdout) - - print("\n=== Starting EFA Tests ===") + LOGGER.info("\n=== Starting EFA Tests ===") _setup_multinode_efa_instances( image_uri, resources["instances_info"][:2], @@ -417,22 +406,22 @@ def test_vllm_on_ec2(resources, image_uri): for conn in [head_conn, worker_conn]: cleanup_containers(conn) - print("EFA tests completed successfully") + LOGGER.info("EFA tests completed successfully") # Run multi-node test test_results["multi_node"] = run_multi_node_test(head_conn, worker_conn, image_uri) else: - print("\nSkipping multi-node test: insufficient instances") + LOGGER.info("\nSkipping multi-node test: insufficient instances") - print("\n=== Test Summary ===") + LOGGER.info("\n=== Test Summary ===") for test_name, result in test_results.items(): if result is not None: - print( + LOGGER.info( f"{test_name.replace('_', ' ').title()} test: {'Passed' if result else 'Failed'}" ) else: - print(f"{test_name.replace('_', ' ').title()} test: Not Run") + LOGGER.info(f"{test_name.replace('_', ' ').title()} test: Not Run") if is_arm64: if not test_results["single_node"]: @@ -441,5 +430,5 @@ def test_vllm_on_ec2(resources, image_uri): raise Exception("All tests failed") except Exception as e: - print(f"Test execution failed: {str(e)}") + LOGGER.error(f"Test execution failed: {str(e)}") raise From fa34264fd901d8c4a633bf559833c92c76815fc9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 14:58:13 -0700 Subject: [PATCH 24/33] formatting --- infra/test_infra/ec2/vllm/setup_ec2.py | 18 ++++++++++-------- test/test_utils/__init__.py | 2 +- test/v2/ec2/efa/test_efa.py | 4 ++-- test/v2/ec2/vllm/test_ec2.py | 8 ++++++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git 
a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 2f08600b4a90..57d2227fd788 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -164,12 +164,14 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): ec2_connections[instance_id] = connection # Store connection parameters for later recreation - connection_params.append({ - "instance_id": instance_id, - "host": public_ip, - "user": "ec2-user", - "key_filename": key_filename - }) + connection_params.append( + { + "instance_id": instance_id, + "host": public_ip, + "user": "ec2-user", + "key_filename": key_filename, + } + ) if not master_connection: master_connection = connection @@ -196,7 +198,7 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") - + master_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" ) @@ -207,7 +209,7 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") - + worker_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" ) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 54461488a75f..198050e8823a 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -1309,7 +1309,7 @@ def upload_tests_to_s3(testname_datetime_suffix): raise EnvironmentError("Test is being run from wrong path") while os.path.basename(path) != "dlc_tests": path = os.path.dirname(path) - + # If if new test structure is enabled, upload only v2 directory for new test structure if is_new_test_structure_enabled(): v2_path = os.path.join(os.path.dirname(path), "v2") diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 8c26e32e6271..062259093b94 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -214,7 +214,7 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) - + # Uncomment to verify container file structure in case of path issues # LOGGER.info(f"Verifying files inside {master_container_name} container") # run_cmd_on_container( @@ -333,7 +333,7 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) - + LOGGER.info(f"Container {container_name} started successfully") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index be6982188cdf..04de0d70028b 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -154,7 +154,9 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_ f"docker exec -i {container_name} /bin/bash -c '{serve_command} > vllm.log 2>&1 &'" ) - LOGGER.info("Waiting for model to be ready, approx estimated time to complete is 15 mins...") + LOGGER.info( + "Waiting for model to be ready, approx estimated 
time to complete is 15 mins..." + ) if not wait_for_container_ready(head_connection, container_name, timeout=2000): raise Exception("Container failed to become ready within timeout period") @@ -327,7 +329,9 @@ def test_vllm_on_ec2(resources, image_uri): ec2_connections[params["instance_id"]] = connection LOGGER.info(f"Recreated connection to instance {params['instance_id']}") except Exception as e: - LOGGER.error(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") + LOGGER.error( + f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}" + ) raise else: LOGGER.info("Creating new connections to instances") From bbcc859b37ab4197606c7a036020215deae97c5f Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 15:03:43 -0700 Subject: [PATCH 25/33] Clean up unused imports --- infra/test_infra/ec2/utils.py | 1 - infra/test_infra/ec2/vllm/fsx_utils.py | 3 +-- .../test_infra/validators/base_platform_validator.py | 3 +-- test/v2/ec2/efa/test_efa.py | 12 +----------- test/v2/ec2/vllm/test_ec2.py | 1 - 5 files changed, 3 insertions(+), 17 deletions(-) diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index ec3204d46ec0..30c8430b7ba5 100644 --- a/infra/test_infra/ec2/utils.py +++ b/infra/test_infra/ec2/utils.py @@ -1,7 +1,6 @@ import os import time import re -import sys import uuid import copy diff --git a/infra/test_infra/ec2/vllm/fsx_utils.py b/infra/test_infra/ec2/vllm/fsx_utils.py index a7b1b0dc4446..eed3635550b1 100644 --- a/infra/test_infra/ec2/vllm/fsx_utils.py +++ b/infra/test_infra/ec2/vllm/fsx_utils.py @@ -1,7 +1,6 @@ import time from invoke import run -from typing import Dict, List, Any -import boto3 +from typing import Dict, List from botocore.exceptions import ClientError from infra.test_infra.test_infra_utils import create_logger diff --git a/infra/test_infra/validators/base_platform_validator.py b/infra/test_infra/validators/base_platform_validator.py index 3f78e00f57d3..3353c721ef22 100644 --- a/infra/test_infra/validators/base_platform_validator.py +++ b/infra/test_infra/validators/base_platform_validator.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import List, Dict, Type -from dataclasses import dataclass +from typing import List, Dict class BasePlatformValidator(ABC): diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 062259093b94..43ef77c563d8 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -214,16 +214,6 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) - - # Uncomment to verify container file structure in case of path issues - # LOGGER.info(f"Verifying files inside {master_container_name} container") - # run_cmd_on_container( - # master_container_name, - # master_connection, - # "ls -la /test/v2/ec2/efa/", - # hide=False, - # ) - # Build all_reduce_perf binary using nccl-tests promise = run_cmd_on_container( master_container_name, @@ -333,7 +323,7 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) - + LOGGER.info(f"Container {container_name} started successfully") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index 04de0d70028b..54e2cfb12005 
100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -16,7 +16,6 @@ from test.v2.ec2.efa.test_efa import ( _setup_multinode_efa_instances, EFA_SANITY_TEST_CMD, - MASTER_CONTAINER_NAME, HOSTS_FILE_LOCATION, EFA_INTEGRATION_TEST_CMD, DEFAULT_EFA_TIMEOUT, From 22b31bfe227318faf8ab76bb0038af0614334d37 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 15:16:15 -0700 Subject: [PATCH 26/33] remove unused code for vllm --- test/v2/ec2/efa/test_efa.py | 112 ------------------------------------ 1 file changed, 112 deletions(-) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 43ef77c563d8..88a1397b19f3 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -1,21 +1,12 @@ import os - -import pytest - import test.test_utils.ec2 as ec2_utils from test.test_utils import ( CONTAINER_TESTS_PREFIX_V2, get_account_id_from_image_uri, get_region_from_image_uri, - is_pr_context, - is_efa_dedicated, - are_heavy_instance_ec2_tests_enabled, login_to_ecr_registry, run_cmd_on_container, ) -from packaging.version import Version -from packaging.specifiers import SpecifierSet - from infra.test_infra.ec2.utils import ( get_efa_ec2_instance_type, filter_efa_instance_type, @@ -78,109 +69,6 @@ def get_efa_container_name(framework, test_scenario, arch_type, node_role=None): ) -# TODO: decide on whether to keep this commented out or left out until actual implementation of each framework -# def test_pytorch_efa( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only -# ): -# """ -# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA -# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version -# installed in the DLC image. The test also builds nccl-tests to create the all_reduce_perf -# binary necessary for multinode tests, on each node. -# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances -# on pipelines. -# :param pytorch_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# number_of_nodes = 2 -# _setup_multinode_efa_instances( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) - -# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" - -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"{EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) - - -# def test_efa_tensorflow( -# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only -# ): -# """ -# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA -# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version -# installed in the DLC image. 
The test also builds nccl-tests to create the all_reduce_perf -# binary necessary for multinode tests, on each node. -# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances -# on pipelines. -# :param tensorflow_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# number_of_nodes = 2 -# _setup_multinode_efa_instances( -# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) - -# # pass IPv6 flag if enabled -# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" - -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"export CUDA_HOME='/usr/local/cuda'; {EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) - - -# def test_pytorch_efa_healthcheck( -# pytorch_training, -# efa_ec2_instances, -# efa_ec2_connections, -# ec2_instance_type, -# region, -# gpu_only, -# ): -# """ -# Run EFA Health Check tests on DLC. -# :param pytorch_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# _setup_multinode_efa_instances( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) - - def _setup_multinode_efa_instances( image, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, arch_type=None ): From 6c0103258ef57e4fbe001063f1c83ca689967cf6 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 15:17:17 -0700 Subject: [PATCH 27/33] rerun ec2 and eks using new path --- vllm/buildspec.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index e79fe18ff8f8..a007d441998e 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - # - platform: eks - # params: - # cluster: dlc-vllm - # namespace: vllm - # run: - # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + - platform: eks + params: + cluster: dlc-vllm + namespace: vllm + run: + - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From c652b7669c5cae410df013d727aef548443e34c5 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 17:12:03 -0700 Subject: [PATCH 28/33] change logger level --- infra/test_infra/ec2/vllm/setup_ec2.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 57d2227fd788..29382bb7bba4 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -433,7 +433,7 @@ def wait_for_instances(instance_ids): if resources.get("elastic_ips"): try: delete_elastic_ips(resources["elastic_ips"], ec2_cli) - LOGGER.error(f"Deleted elastic IPs: {resources['elastic_ips']}") + LOGGER.info(f"Deleted elastic IPs: {resources['elastic_ips']}") except Exception as e: cleanup_errors.append(f"Failed to cleanup Elastic IPs: {str(e)}") From 2680d2f5ad69ba5d45be5f0afc09c6d1644cd94d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 17:20:44 -0700 Subject: [PATCH 29/33] fix the hardcoded test command by adding test registry --- infra/test_infra/ec2/setup.py | 37 ++++++++++++++++++++++++++--------- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 102563cd2519..70c36fc9b8ba 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -6,6 +6,17 @@ LOGGER = create_logger(__name__) +# Registry for framework-specific test execution +# Frameworks requiring direct Python function calls with resources should be registered here +TEST_REGISTRY = { + "vllm": { + "module": "test.v2.ec2.vllm.test_ec2", + "function": "test_vllm_on_ec2", + "requires_resources": True, + }, + # Future frameworks to be added here +} + class EC2Platform: def __init__(self): @@ -54,17 +65,25 @@ def execute_command(self, cmd): "FRAMEWORK": self.framework, } - # Check if this is a vLLM test command - # TODO: check if there is a better way to handle this - if self.framework == "vllm" and "test/v2/ec2/vllm/test_ec2.py" in cmd: - LOGGER.info(f"Executing vLLM test via direct call: {cmd}") - from test.v2.ec2.vllm.test_ec2 import test_vllm_on_ec2 - - # Pass resources and image_uri; test reads config from env vars - test_vllm_on_ec2(self.resources, self.image_uri) + # Check registry for framework-specific handling + test_config = TEST_REGISTRY.get(self.framework) + + if test_config and test_config.get("requires_resources"): + # Direct Python function call for tests requiring resource access + LOGGER.info(f"Executing {self.framework} test via direct call: {cmd}") + + module_path = test_config["module"] + function_name = test_config["function"] + + # Dynamically import and call the test function + module = __import__(module_path, fromlist=[function_name]) + test_function = getattr(module, function_name) + + # Pass resources and image_uri directly + test_function(self.resources, self.image_uri) LOGGER.info(f"Command completed successfully: {cmd}") else: - # Standard shell command execution for other cases + # Standard shell command execution repo_root = get_cloned_folder_path() with self.ctx.cd(repo_root): diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index a007d441998e..e79fe18ff8f8 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - - platform: eks - params: - cluster: dlc-vllm - namespace: vllm - run: - - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + # - platform: eks + # params: + # cluster: dlc-vllm + # namespace: vllm + # run: + # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From eea38330a5c48f3ca190e83008aea41dd4725afe Mon Sep 17 00:00:00 2001 
From: Jinyan Li Date: Mon, 20 Oct 2025 22:42:05 -0700 Subject: [PATCH 30/33] test current test path to confirm still working correctly --- dlc_developer_config.toml | 2 +- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 6bfe70c08d6d..e433e779db14 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = true +use_new_test_structure = false ### On by default sanity_tests = true diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index e79fe18ff8f8..a007d441998e 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - # - platform: eks - # params: - # cluster: dlc-vllm - # namespace: vllm - # run: - # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + - platform: eks + params: + cluster: dlc-vllm + namespace: vllm + run: + - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From 219e27cf9e678124969f18b14da5e23133722331 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 21 Oct 2025 00:16:01 -0700 Subject: [PATCH 31/33] test with added instance type parsing check against assigned instance type --- dlc_developer_config.toml | 2 +- infra/test_infra/ec2/setup.py | 21 +++++++++++++++++++++ vllm/buildspec.yml | 14 +++++++------- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e433e779db14..6bfe70c08d6d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = false +use_new_test_structure = true ### On by default sanity_tests = true diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 70c36fc9b8ba..46903b9ebad9 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -37,8 +37,29 @@ def setup(self, params): self.framework = params.get("framework") self.arch_type = params.get("arch_type", "x86_64") self.image_uri = params.get("image_uri") + self.instance_type = params.get("instance_type") + self.node_count = params.get("node_count") if self.framework == "vllm": + # Validate buildspec params match hardcoded logic for vLLM + is_arm64 = "arm64" in self.image_uri + expected_instance_type = "g5g.16xlarge" if is_arm64 else "p4d.24xlarge" + expected_node_count = 1 if is_arm64 else 2 + + if self.instance_type and self.instance_type != expected_instance_type: + LOGGER.warning( + f"Buildspec instance_type '{self.instance_type}' differs from " + f"hardcoded value '{expected_instance_type}'. Using hardcoded value." + ) + + # Note: The platform validator already enforces node_count == 2 for multi-node EFA tests, + # so the node_count check below would only trigger if the validator is bypassed or modified + if self.node_count and self.node_count != expected_node_count: + LOGGER.warning( + f"Buildspec node_count '{self.node_count}' differs from " + f"hardcoded value '{expected_node_count}'. Using hardcoded value." 
+ ) + # vLLM requires vLLM-specific setup (FSx + multi-node) LOGGER.info(f"Setting up vLLM infrastructure for image: {self.image_uri}") from infra.test_infra.ec2.vllm.setup_ec2 import setup as vllm_setup diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index a007d441998e..efa0c07b97cf 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -54,14 +54,14 @@ images: tests: - platform: ec2-multi-node-efa params: - instance_type: p4d.24xlarge + instance_type: p5.48xlarge node_count: 2 run: - python test/v2/ec2/vllm/test_ec2.py - - platform: eks - params: - cluster: dlc-vllm - namespace: vllm - run: - - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + # - platform: eks + # params: + # cluster: dlc-vllm + # namespace: vllm + # run: + # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From 842453e7a3fa285e861f0396b379778585904bd9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 21 Oct 2025 14:43:54 -0700 Subject: [PATCH 32/33] formatting and clean up code, rerun test with unexpected instance type --- infra/test_infra/ec2/setup.py | 18 +++++++++--------- infra/test_infra/ec2/vllm/setup_ec2.py | 20 ++++++++++---------- test/v2/ec2/efa/test_efa.py | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 46903b9ebad9..26cdb652dad9 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -42,16 +42,16 @@ def setup(self, params): if self.framework == "vllm": # Validate buildspec params match hardcoded logic for vLLM - is_arm64 = "arm64" in self.image_uri + is_arm64 = self.arch_type == "arm64" expected_instance_type = "g5g.16xlarge" if is_arm64 else "p4d.24xlarge" expected_node_count = 1 if is_arm64 else 2 - + if self.instance_type and self.instance_type != expected_instance_type: LOGGER.warning( f"Buildspec instance_type '{self.instance_type}' differs from " f"hardcoded value '{expected_instance_type}'. Using hardcoded value." ) - + # Note: The platform validator already enforces node_count == 2 for multi-node EFA tests, # so the node_count check below would only trigger if the validator is bypassed or modified if self.node_count and self.node_count != expected_node_count: @@ -59,12 +59,12 @@ def setup(self, params): f"Buildspec node_count '{self.node_count}' differs from " f"hardcoded value '{expected_node_count}'. Using hardcoded value." 
) - + # vLLM requires vLLM-specific setup (FSx + multi-node) LOGGER.info(f"Setting up vLLM infrastructure for image: {self.image_uri}") from infra.test_infra.ec2.vllm.setup_ec2 import setup as vllm_setup - self.resources = vllm_setup(self.image_uri) + self.resources = vllm_setup(self.image_uri, self.arch_type) LOGGER.info("vLLM setup completed successfully") else: # standard EC2 setup for other frameworks @@ -88,18 +88,18 @@ def execute_command(self, cmd): # Check registry for framework-specific handling test_config = TEST_REGISTRY.get(self.framework) - + if test_config and test_config.get("requires_resources"): # Direct Python function call for tests requiring resource access LOGGER.info(f"Executing {self.framework} test via direct call: {cmd}") - + module_path = test_config["module"] function_name = test_config["function"] - + # Dynamically import and call the test function module = __import__(module_path, fromlist=[function_name]) test_function = getattr(module, function_name) - + # Pass resources and image_uri directly test_function(self.resources, self.image_uri) LOGGER.info(f"Command completed successfully: {cmd}") diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 29382bb7bba4..e45ace2b1be6 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -45,15 +45,15 @@ TEST_ID = str(uuid.uuid4()) -def ec2_instance_ami(region, image): - if "arm64" in image: +def ec2_instance_ami(region, arch_type): + if arch_type == "arm64": return AL2023_BASE_DLAMI_ARM64_US_WEST_2 return test_utils.get_dlami_id(region) -def ec2_instance_type(image): - if "arm64" in image: +def ec2_instance_type(arch_type): + if arch_type == "arm64": return "g5g.16xlarge" else: return "p4d.24xlarge" @@ -488,12 +488,12 @@ def wait_for_instances(instance_ids): raise Exception("Cleanup errors occurred:\n" + "\n".join(cleanup_errors)) -def launch_ec2_instances(ec2_cli, image): +def launch_ec2_instances(ec2_cli, arch_type): """Launch EC2 instances with EFA support""" - instance_type = ec2_instance_type(image) - ami_id = ec2_instance_ami(DEFAULT_REGION, image) + instance_type = ec2_instance_type(arch_type) + ami_id = ec2_instance_ami(DEFAULT_REGION, arch_type) az_options = availability_zone_options(ec2_cli, instance_type, DEFAULT_REGION) - is_arm64 = True if "arm64" in image else False + is_arm64 = arch_type == "arm64" instances_info = efa_ec2_instances( ec2_client=ec2_cli, @@ -591,7 +591,7 @@ def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_ connection.run(cmd) -def setup(image): +def setup(image, arch_type): """Main setup function for VLLM on EC2 with FSx""" LOGGER.info("Testing vllm on ec2........") fsx = FsxSetup(DEFAULT_REGION) @@ -602,7 +602,7 @@ def setup(image): vpc_id = get_default_vpc_id(ec2_cli) subnet_ids = get_subnet_id_by_vpc(ec2_cli, vpc_id) - instance_result = launch_ec2_instances(ec2_cli, image) + instance_result = launch_ec2_instances(ec2_cli, arch_type) resources["instances_info"] = instance_result["instances"] resources["elastic_ips"] = instance_result["elastic_ips"] resources["connection_params"] = instance_result["connection_params"] diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 88a1397b19f3..e34fcdb604df 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -211,7 +211,7 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit 
memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) - + LOGGER.info(f"Container {container_name} started successfully") From 8ed0f904b1f8fe3b35a20cd63eb98b579513b7e8 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 21 Oct 2025 16:12:11 -0700 Subject: [PATCH 33/33] Revert config changes --- dlc_developer_config.toml | 2 +- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 6bfe70c08d6d..e433e779db14 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = true +use_new_test_structure = false ### On by default sanity_tests = true diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index efa0c07b97cf..1939af0be055 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -54,14 +54,14 @@ images: tests: - platform: ec2-multi-node-efa params: - instance_type: p5.48xlarge + instance_type: p4d.24xlarge node_count: 2 run: - python test/v2/ec2/vllm/test_ec2.py # - platform: eks - # params: - # cluster: dlc-vllm - # namespace: vllm - # run: - # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + params: + cluster: dlc-vllm + namespace: vllm + run: + - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file
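
PATCH 29 and PATCH 32 replace the hardcoded vLLM branch in infra/test_infra/ec2/setup.py with a TEST_REGISTRY that maps a framework name to a dotted module path, an entrypoint function, and a requires_resources flag; execute_command() looks the framework up and either imports that module and calls the entrypoint directly with the provisioned resources and image URI, or falls back to running the buildspec command in a shell. The snippet below is a minimal standalone sketch of that dispatch pattern, not the repository code: the demo_tests.vllm_ec2 path, the run_vllm_test entrypoint, and the resource fields are placeholders, and importlib.import_module stands in for the patch's __import__(module_path, fromlist=[function_name]) call.

    """Standalone sketch of the TEST_REGISTRY dispatch pattern; all names are illustrative."""
    import importlib
    import sys
    import types

    # Hypothetical registry: framework name -> how its test entrypoint is invoked.
    TEST_REGISTRY = {
        "vllm": {
            "module": "demo_tests.vllm_ec2",   # placeholder dotted path
            "function": "run_vllm_test",       # placeholder entrypoint name
            "requires_resources": True,
        },
        # Frameworks without an entry fall back to plain shell execution.
    }

    # Stand up the placeholder module in-process so the sketch runs without the real repo.
    _demo = types.ModuleType("demo_tests.vllm_ec2")

    def run_vllm_test(resources, image_uri):
        print(f"running vLLM test for {image_uri} with resources: {sorted(resources)}")

    _demo.run_vllm_test = run_vllm_test
    sys.modules["demo_tests.vllm_ec2"] = _demo

    def execute_command(framework, cmd, resources, image_uri):
        """Dispatch like EC2Platform.execute_command: direct call if registered, else shell."""
        config = TEST_REGISTRY.get(framework)
        if config and config.get("requires_resources"):
            # Import the registered module and call its entrypoint with the live resources.
            module = importlib.import_module(config["module"])
            getattr(module, config["function"])(resources, image_uri)
        else:
            # Unregistered frameworks keep the existing shell-command path.
            print(f"would run shell command: {cmd}")

    if __name__ == "__main__":
        resources = {"instances_info": [], "fsx_config": None, "sg_fsx": None}
        execute_command(
            "vllm",
            "python test/v2/ec2/vllm/test_ec2.py",
            resources,
            "example.dkr.ecr.us-west-2.amazonaws.com/vllm:sample-tag",
        )
        execute_command("pytorch", "pytest -q", resources, "example-image")

Keeping the registry in one place means a new framework opts into direct invocation by adding a single entry (as the patch's "Future frameworks to be added here" comment suggests), while everything else continues through the unchanged shell path.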