From 1a5610c17ecba5cd159b191a5772ae306481fd55 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 13 Oct 2025 14:45:15 -0700 Subject: [PATCH 01/33] rename .infra to infra and clean up imports --- {.infra => infra}/__init__.py | 0 {.infra => infra}/test_infra/__init__.py | 0 {.infra => infra}/test_infra/ec2/__init__.py | 0 {.infra => infra}/test_infra/ec2/setup.py | 6 +----- {.infra => infra}/test_infra/eks/__init__.py | 0 .../test_infra/eks/multinode_heavy/__init__.py | 0 .../test_infra/eks/multinode_heavy/eks-cluster.yaml | 0 .../test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml | 0 .../test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml | 0 .../eks/multinode_heavy/fsx-storage-class.yaml | 0 .../eks/multinode_heavy/large-model-nodegroup.yaml | 0 {.infra => infra}/test_infra/eks/setup.py | 6 +----- {.infra => infra}/test_infra/entrypoint.py | 9 +++------ {.infra => infra}/test_infra/test_infra_utils.py | 5 +---- {.infra => infra}/test_infra/validators/__init__.py | 0 .../test_infra/validators/base_platform_validator.py | 0 .../test_infra/validators/platform_configs.py | 0 .../test_infra/validators/platform_validator_utils.py | 4 ++-- .../test_infra/validators/platform_validators.py | 4 ++-- test/testrunner.py | 10 +--------- 20 files changed, 11 insertions(+), 33 deletions(-) rename {.infra => infra}/__init__.py (100%) rename {.infra => infra}/test_infra/__init__.py (100%) rename {.infra => infra}/test_infra/ec2/__init__.py (100%) rename {.infra => infra}/test_infra/ec2/setup.py (93%) rename {.infra => infra}/test_infra/eks/__init__.py (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/__init__.py (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/eks-cluster.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/fsx-storage-class.yaml (100%) rename {.infra => infra}/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml (100%) rename {.infra => infra}/test_infra/eks/setup.py (91%) rename {.infra => infra}/test_infra/entrypoint.py (93%) rename {.infra => infra}/test_infra/test_infra_utils.py (96%) rename {.infra => infra}/test_infra/validators/__init__.py (100%) rename {.infra => infra}/test_infra/validators/base_platform_validator.py (100%) rename {.infra => infra}/test_infra/validators/platform_configs.py (100%) rename {.infra => infra}/test_infra/validators/platform_validator_utils.py (74%) rename {.infra => infra}/test_infra/validators/platform_validators.py (87%) diff --git a/.infra/__init__.py b/infra/__init__.py similarity index 100% rename from .infra/__init__.py rename to infra/__init__.py diff --git a/.infra/test_infra/__init__.py b/infra/test_infra/__init__.py similarity index 100% rename from .infra/test_infra/__init__.py rename to infra/test_infra/__init__.py diff --git a/.infra/test_infra/ec2/__init__.py b/infra/test_infra/ec2/__init__.py similarity index 100% rename from .infra/test_infra/ec2/__init__.py rename to infra/test_infra/ec2/__init__.py diff --git a/.infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py similarity index 93% rename from .infra/test_infra/ec2/setup.py rename to infra/test_infra/ec2/setup.py index a3761b2f514b..ea4a0084a54c 100644 --- a/.infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -2,11 +2,7 @@ import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path - -current_dir = 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(current_dir) - -from test_infra_utils import create_logger +from infra.test_infra.test_infra_utils import create_logger LOGGER = create_logger(__name__) diff --git a/.infra/test_infra/eks/__init__.py b/infra/test_infra/eks/__init__.py similarity index 100% rename from .infra/test_infra/eks/__init__.py rename to infra/test_infra/eks/__init__.py diff --git a/.infra/test_infra/eks/multinode_heavy/__init__.py b/infra/test_infra/eks/multinode_heavy/__init__.py similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/__init__.py rename to infra/test_infra/eks/multinode_heavy/__init__.py diff --git a/.infra/test_infra/eks/multinode_heavy/eks-cluster.yaml b/infra/test_infra/eks/multinode_heavy/eks-cluster.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/eks-cluster.yaml rename to infra/test_infra/eks/multinode_heavy/eks-cluster.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml b/infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml rename to infra/test_infra/eks/multinode_heavy/fsx-lustre-pv.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml b/infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml rename to infra/test_infra/eks/multinode_heavy/fsx-lustre-pvc.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml b/infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml rename to infra/test_infra/eks/multinode_heavy/fsx-storage-class.yaml diff --git a/.infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml b/infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml similarity index 100% rename from .infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml rename to infra/test_infra/eks/multinode_heavy/large-model-nodegroup.yaml diff --git a/.infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py similarity index 91% rename from .infra/test_infra/eks/setup.py rename to infra/test_infra/eks/setup.py index a6478f7c8b8c..9e287e77ddde 100644 --- a/.infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -2,11 +2,7 @@ import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path - -current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(current_dir) - -from test_infra_utils import create_logger +from infra.test_infra.test_infra_utils import create_logger LOGGER = create_logger(__name__) diff --git a/.infra/test_infra/entrypoint.py b/infra/test_infra/entrypoint.py similarity index 93% rename from .infra/test_infra/entrypoint.py rename to infra/test_infra/entrypoint.py index 48974ec01b4b..7fe859bc087c 100644 --- a/.infra/test_infra/entrypoint.py +++ b/infra/test_infra/entrypoint.py @@ -4,13 +4,10 @@ from test.test_utils import get_dlc_images from codebuild_environment import get_cloned_folder_path -current_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(current_dir) - -from ec2.setup import EC2Platform -from eks.setup import EKSPlatform -from test_infra_utils import ( +from infra.test_infra.ec2.setup import EC2Platform +from infra.test_infra.eks.setup import EKSPlatform +from 
infra.test_infra.test_infra_utils import ( create_logger, parse_buildspec, validate_and_filter_tests, diff --git a/.infra/test_infra/test_infra_utils.py b/infra/test_infra/test_infra_utils.py similarity index 96% rename from .infra/test_infra/test_infra_utils.py rename to infra/test_infra/test_infra_utils.py index 9cf6f50d6c76..a98707678797 100644 --- a/.infra/test_infra/test_infra_utils.py +++ b/infra/test_infra/test_infra_utils.py @@ -7,10 +7,7 @@ from src.buildspec import Buildspec from test.test_utils import get_buildspec_path from codebuild_environment import get_cloned_folder_path - -current_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(current_dir) -from validators.platform_validator_utils import get_platform_validator +from infra.test_infra.validators.platform_validator_utils import get_platform_validator def create_logger(name, level=logging.INFO): diff --git a/.infra/test_infra/validators/__init__.py b/infra/test_infra/validators/__init__.py similarity index 100% rename from .infra/test_infra/validators/__init__.py rename to infra/test_infra/validators/__init__.py diff --git a/.infra/test_infra/validators/base_platform_validator.py b/infra/test_infra/validators/base_platform_validator.py similarity index 100% rename from .infra/test_infra/validators/base_platform_validator.py rename to infra/test_infra/validators/base_platform_validator.py diff --git a/.infra/test_infra/validators/platform_configs.py b/infra/test_infra/validators/platform_configs.py similarity index 100% rename from .infra/test_infra/validators/platform_configs.py rename to infra/test_infra/validators/platform_configs.py diff --git a/.infra/test_infra/validators/platform_validator_utils.py b/infra/test_infra/validators/platform_validator_utils.py similarity index 74% rename from .infra/test_infra/validators/platform_validator_utils.py rename to infra/test_infra/validators/platform_validator_utils.py index 092544ed72ea..d1513a008865 100644 --- a/.infra/test_infra/validators/platform_validator_utils.py +++ b/infra/test_infra/validators/platform_validator_utils.py @@ -1,5 +1,5 @@ -from .base_platform_validator import BasePlatformValidator -from .platform_validators import EC2MultiNodeValidator, EKSValidator +from infra.test_infra.validators.base_platform_validator import BasePlatformValidator +from infra.test_infra.validators.platform_validators import EC2MultiNodeValidator, EKSValidator _VALIDATORS = {"ec2-multi-node": EC2MultiNodeValidator, "eks": EKSValidator} diff --git a/.infra/test_infra/validators/platform_validators.py b/infra/test_infra/validators/platform_validators.py similarity index 87% rename from .infra/test_infra/validators/platform_validators.py rename to infra/test_infra/validators/platform_validators.py index 7f3febddab18..a1a9f1a41c6b 100644 --- a/.infra/test_infra/validators/platform_validators.py +++ b/infra/test_infra/validators/platform_validators.py @@ -1,6 +1,6 @@ from typing import List, Dict -from .base_platform_validator import BasePlatformValidator -from .platform_configs import EC2Config, EKSConfig +from infra.test_infra.validators.base_platform_validator import BasePlatformValidator +from infra.test_infra.validators.platform_configs import EC2Config, EKSConfig class EC2MultiNodeValidator(BasePlatformValidator): diff --git a/test/testrunner.py b/test/testrunner.py index 796231486375..9ff18fa19539 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -3,7 +3,6 @@ import sys import logging import re -import importlib.util from multiprocessing import Pool, 
Manager from datetime import datetime @@ -34,6 +33,7 @@ from test_utils import KEYS_TO_DESTROY_FILE from test_utils.pytest_cache import PytestCache from test.vllm.trigger_test import test as test_vllm +from infra.test_infra.entrypoint import main as run_new_tests from src.codebuild_environment import get_codebuild_project_name @@ -438,14 +438,6 @@ def main(): try: LOGGER.info(f"Running vLLM EKS EC2 tests with image: {all_image_list[0]}") if new_test_structure_enabled: - project_root = os.path.dirname(os.path.dirname(os.getcwd())) - spec = importlib.util.spec_from_file_location( - "entrypoint", - os.path.join(project_root, ".infra", "test_infra", "entrypoint.py"), - ) - entrypoint_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(entrypoint_module) - run_new_tests = entrypoint_module.main LOGGER.info("Using new buildspec-based test system") run_new_tests() else: From 815527322cec7018a2d1942a53b67116aea30d6a Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:23:25 -0700 Subject: [PATCH 02/33] copy and modify ec2 infra utils --- infra/test_infra/ec2/utils.py | 2183 +++++++++++++++++++++++++++++++++ 1 file changed, 2183 insertions(+) create mode 100644 infra/test_infra/ec2/utils.py diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py new file mode 100644 index 000000000000..86e3d0b8a3a8 --- /dev/null +++ b/infra/test_infra/ec2/utils.py @@ -0,0 +1,2183 @@ +import os +import time +import re +import logging +import sys +import uuid +import copy + +from random import randint + +from collections import Counter + +from inspect import signature + +import boto3 + +from fabric import Connection +from botocore.config import Config +from botocore.exceptions import ClientError +from invoke import run +from packaging.version import Version +from packaging.specifiers import SpecifierSet +from tenacity import ( + retry, + stop_after_attempt, + stop_after_delay, + wait_fixed, + wait_random_exponential, +) + +from test.test_utils import ( + get_synapseai_version_from_tag, + is_deep_canary_context, + is_pr_context, + is_mainline_context, + are_heavy_instance_ec2_tests_enabled, + login_to_ecr_registry, + get_account_id_from_image_uri, + UL_AMI_LIST, +) +from . import DEFAULT_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET +from infra.test_infra.test_infra_utils import create_logger + +EC2_INSTANCE_ROLE_NAME = "ec2TestInstanceRole" + +# List of instance types for which, if instance spin-up fails, the test is skipped instead of failing. 
+ICE_SKIP_INSTANCE_LIST = [] + +# List of instance types which are too powerful for minor tests +HEAVY_INSTANCE_LIST = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + +# Flag to enable IPv6 testing +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + +IPV6_VPC_NAME = os.getenv("IPV6_VPC_NAME") + +LOGGER = create_logger(__name__) + +def filter_only_multi_gpu(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_instance_num_gpus(instance_type=instance_type) > 1 + ] + return filtered_list + + +def filter_only_multi_gpu_and_no_g_type(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_instance_num_gpus(instance_type=instance_type) > 1 + and not instance_type.startswith("g") + ] + return filtered_list + + +def filter_only_single_gpu(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_instance_num_gpus(instance_type=instance_type) == 1 + ] + return filtered_list + + +def filter_no_t32x(instance_type_list): + filtered_list = [ + instance_type for instance_type in instance_type_list if instance_type != "t3.2xlarge" + ] + return filtered_list + + +def is_instance_single_gpu(instance_type): + return get_instance_num_gpus(instance_type=instance_type) == 1 + + +def is_instance_multi_gpu(instance_type): + return get_instance_num_gpus(instance_type=instance_type) > 1 + + +def filter_not_heavy_instance_types(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if instance_type not in HEAVY_INSTANCE_LIST + ] + return filtered_list + + +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html +# both g4dn and g5.24xlarge we use in RC is not RDMA read supported +# performance test will fail if we use g5.24xlarge +def filter_efa_instance_type(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_num_efa_interfaces_for_instance_type(instance_type) + and not instance_type.startswith("g4") + and not instance_type.startswith("g5") + ] + return filtered_list + + +def filter_efa_only_p4_instance_type(instance_type_list): + filtered_list = [ + instance_type + for instance_type in instance_type_list + if get_num_efa_interfaces_for_instance_type(instance_type) + and instance_type.startswith("p4") + ] + return filtered_list + + +def get_cicd_instance_reserved_region(instance_type): + return P4DE_REGION if instance_type in ["p4de.24xlarge"] else DEFAULT_REGION + + +def get_efa_ec2_instance_type(default, filter_function=lambda x: x, job_type=""): + """ + Helper function wrapping around get_ec2_instance_type to parametrize both ec2_instance_type + as well as region in cases where certain instance types are reserved in a particular region. + :param default: Default instance type to use + :param filter_function: filter_function(instance_type_list) A function that takes the list to be generated by + the logic of the get_ec2_instance_type function, and filters the list to only produce "acceptable" instances. + For example, this can be a function that only returns multi-gpu instance types from a given list of instance types. + :param job_type: str "training"/"inference"/"" as required by the instance-type being tested + :return: one item list of instance type -- this is used to parametrize tests, and parameter is required to be + a list. 
+ """ + instance_list = get_ec2_instance_type(default, "gpu", filter_function, job_type=job_type) + instance_region_list = [ + (instance_type, get_cicd_instance_reserved_region(instance_type)) + for instance_type in instance_list + ] + return instance_region_list + + +def get_ec2_instance_type( + default, processor, filter_function=lambda x: x, arch_type="", job_type="" +): + """ + Get EC2 instance type from associated EC2_[CPU|GPU]_INSTANCE_TYPE env variable, or set it to a + default for contexts where the variable is not present (i.e. PR, Nightly, local testing) + + :param default: Default instance type to use + :param processor: "cpu" or "gpu" + :param filter_function: filter_function(instance_type_list) A function that takes the list to be + generated by the logic of the get_ec2_instance_type function, and filters the list to only + produce "acceptable" instances. For example, this can be a function that only returns multi-gpu + instance types from a given list of instance types. + + :return: one item list of instance type -- this is used to parametrize tests, and parameter is + required to be a list. + """ + if is_pr_context() or is_deep_canary_context(): + # This condition filters out instance types that use resources with low-availability, or + # use very expensive instance types. + if not are_heavy_instance_ec2_tests_enabled() and default in HEAVY_INSTANCE_LIST: + return [] + return [default] + + allowed_processors = ("cpu", "gpu", "neuronx", "neuron", "hpu") + job_type_str = f"_{job_type.upper()}" if job_type else "" + if processor not in allowed_processors: + raise RuntimeError( + f"Aborting EC2 test run. Unrecognized processor type {processor}. " + f"Please choose from {allowed_processors}" + ) + instance_type = os.getenv(f"EC2_{processor.upper()}{job_type_str}_INSTANCE_TYPE") + if arch_type == "graviton" or arch_type == "arm64": + instance_type = os.getenv( + f"EC2_{processor.upper()}_{arch_type.upper()}{job_type_str}_INSTANCE_TYPE" + ) + if not instance_type: + return [] + + instance_list = filter_function([instance_type] if instance_type else []) + return instance_list + + +def get_ec2_accelerator_type(default, processor): + """ + Get EC2 instance type from associated EC2_EIA_INSTANCE_TYPE env variable, or set it to a default + for contexts where the variable is not present (i.e. PR, Nightly, local testing) + + :param default: Default accelerator instance type to use + :param processor: "eia" + + :return: one item list of instance type -- this is used to parametrize tests, and parameter is required to be + a list. + """ + allowed_processors = ("eia",) + if processor not in allowed_processors: + raise RuntimeError( + f"Aborting EC2 test run. Unrecognized processor type {processor}. 
" + f"Please choose from {allowed_processors}" + ) + accelerator_type = os.getenv(f"EC2_{processor.upper()}_INSTANCE_TYPE") + if not accelerator_type: + if is_mainline_context(): + return [] + return [default] + return [accelerator_type] + + +def launch_instance( + ami_id, + instance_type, + ec2_key_name=None, + region=DEFAULT_REGION, + user_data=None, + iam_instance_profile_name=None, + instance_name="", +): + """ + Launch an instance + :param ami_id: AMI ID to be used for launched instance + :param instance_type: Instance type of launched instance + :param region: Region where instance will be launched + :param user_data: Script to run when instance is launched as a str + :param iam_instance_profile_arn: EC2 Role to be attached + :param instance_name: Tag to display as Name on EC2 Console + :return: Information about the instance that was launched + """ + if not ami_id: + raise Exception("No ami_id provided") + if not ec2_key_name: + raise Exception("Ec2 Key name must be provided") + client = boto3.Session(region_name=region).client("ec2") + LOGGER.info(f"Using AMI ID: {ami_id}") + volume_name = "/dev/sda1" if ami_id in UL_AMI_LIST else "/dev/xvda" + + # Construct the dictionary with the arguments for API call + arguments_dict = { + "KeyName": ec2_key_name, + "ImageId": ami_id, + "InstanceType": instance_type, + "MaxCount": 1, + "MinCount": 1, + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [{"Key": "Name", "Value": f"CI-CD {instance_name}"}], + }, + ], + "MetadataOptions": { + "HttpTokens": "required", + "HttpEndpoint": "enabled", + "HttpPutResponseHopLimit": 2, + }, + "BlockDeviceMappings": [ + { + "DeviceName": volume_name, + "Ebs": { + "VolumeSize": 150, + }, + } + ], + } + if user_data: + arguments_dict["UserData"] = user_data + if iam_instance_profile_name: + arguments_dict["IamInstanceProfile"] = {"Name": iam_instance_profile_name} + + reservations = get_available_reservations( + ec2_client=client, instance_type=instance_type, min_availability=arguments_dict["MinCount"] + ) + + while reservations: + reservation = reservations.pop(0) + arguments_dict["CapacityReservationSpecification"] = { + "CapacityReservationTarget": { + "CapacityReservationId": reservation["CapacityReservationId"] + } + } + try: + response = client.run_instances(**arguments_dict) + LOGGER.info( + f"Your {instance_type} reservation is ready, please wait to be seated. Launching..." + ) + if is_mainline_context(): + LOGGER.info(f"Launched instance via {reservation}") + return response["Instances"][0] + except ClientError as e: + LOGGER.error(f"Failed to launch via {instance_type} reservation - {e}") + # Refresh available reservations + time.sleep(randint(10, 30)) + reservations = get_available_reservations( + ec2_client=client, + instance_type=instance_type, + min_availability=arguments_dict["MinCount"], + ) + + # Clean up cap reservation if we don't find one + arguments_dict.pop("CapacityReservationSpecification", None) + LOGGER.info(f"No capacity reservation available for {instance_type}, trying elsewhere...") + response = client.run_instances(**arguments_dict) + + if not response or len(response["Instances"]) < 1: + raise Exception( + "Unable to launch the instance. 
\ + Did not return any response" + ) + + return response["Instances"][0] + + +def get_available_reservations(ec2_client, instance_type, min_availability=1): + """ + Get capacity reservations in our region that have our minimum availability + + Args: + ec2_client (boto3.client): EC2 Boto3 client + instance_type (string): instance type, i.e. g5.8xlarge + min_availability (int, optional): Minimum number of instances to launch. Defaults to 1. + + Returns: + list: list of dictionaries of reservations + """ + reservations = ec2_client.describe_capacity_reservations() + + open_tables = [ + reservation + for reservation in reservations["CapacityReservations"] + if reservation["InstanceType"] == instance_type + and reservation["AvailableInstanceCount"] >= min_availability + ] + + # Sort by ascending instance count and total instance count, + # so that we take minimum instances required, and leave other reservations + # open for larger parties + open_tables.sort(key=lambda res: res["TotalInstanceCount"]) + + return sorted(open_tables, key=lambda res: res["AvailableInstanceCount"]) + + +@retry( + reraise=True, + stop=stop_after_delay(30 * 60), # Keep retrying for 30 minutes + wait=wait_random_exponential(min=60, max=5 * 60), # Retry after waiting 1-5 minutes +) +def launch_instances_with_retry( + ec2_resource, ec2_client, availability_zone_options, ec2_create_instances_definition, fn_name="" +): + """ + Helper function to launch EC2 instances with retry capability, to allow multiple attempts + when facing instance capacity issues. + :param ec2_resource: boto3 EC2 Service Resource object + :param ec2_client: boto3 EC2 Client object + :param availability_zone_options: list of availability zones in which to try to run instances + :param ec2_create_instances_definition: dict of parameters to pass to + ec2_resource.create_instances + :param fn_name: string - function name for ease of logging + :return: list of EC2 Instance Resource objects for instances launched + """ + + instances = None + reservations = get_available_reservations( + ec2_client=ec2_client, + instance_type=ec2_create_instances_definition["InstanceType"], + min_availability=ec2_create_instances_definition["MinCount"], + ) + # Look at available CRs first + while reservations: + reservation = reservations.pop(0) + ec2_create_instances_definition["CapacityReservationSpecification"] = { + "CapacityReservationTarget": { + "CapacityReservationId": reservation["CapacityReservationId"] + } + } + try: + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + LOGGER.info( + f"Your reservation is ready for {fn_name}, please wait to be seated. Launching..." + ) + if is_mainline_context(): + LOGGER.info(f"Launched instance for {fn_name} via {reservation}") + return instances + except ClientError as e: + LOGGER.error(f"Failed to launch via reservation for {fn_name} - {e}") + + # Clean up capacity reservation if it failed + ec2_create_instances_definition.pop("CapacityReservationSpecification", None) + + LOGGER.info( + f"Looks like you didn't have a reservation for {fn_name}, let's see if we can seat you as a walk-in..." 
+ ) + + if availability_zone_options: + error = None + for a_zone in availability_zone_options: + ec2_create_instances_definition["Placement"] = {"AvailabilityZone": a_zone} + try: + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + if instances: + break + except ClientError as e: + LOGGER.error(f"Failed to launch in {a_zone} due to {e} for {fn_name}") + error = e + continue + if not instances: + raise error + else: + instances = ec2_resource.create_instances(**ec2_create_instances_definition) + return instances + + +def launch_efa(ec2_client, ec2_instance_type, ec2_run_instances_definition, availability_zone): + ec2_efa_run_instances_definition = copy.deepcopy(ec2_run_instances_definition) + ec2_efa_run_instances_definition.update( + { + "Placement": {"AvailabilityZone": availability_zone}, + "NetworkInterfaces": generate_network_interfaces( + ec2_client, ec2_instance_type, availability_zone + ), + } + ) + response = ec2_client.run_instances(**ec2_efa_run_instances_definition) or {} + return response.get("Instances") + + +def launch_efa_with_reservations( + ec2_client, ec2_instance_type, reservations, ec2_run_instances_definition, fn_name="" +): + ec2_run_instances_reserved_definition = copy.deepcopy(ec2_run_instances_definition) + while reservations: + reservation = reservations.pop(0) + ec2_run_instances_reserved_definition["CapacityReservationSpecification"] = { + "CapacityReservationTarget": { + "CapacityReservationId": reservation["CapacityReservationId"] + } + } + try: + instances = launch_efa( + ec2_client, + ec2_instance_type, + ec2_run_instances_reserved_definition, + reservation["AvailabilityZone"], + ) + if instances: + LOGGER.info( + f"Your EFA reservation is ready for {fn_name}, please wait to be seated. Launching..." + ) + if is_mainline_context(): + LOGGER.info(f"Launched EFA enabled instance for {fn_name} via {reservation}") + return instances + except ClientError as e: + LOGGER.debug( + f"Failed to launch EFA instance for {fn_name} from reservation due to {e}\n" + "Checking additional open reservations..." + ) + return [] + + +def validate_efa_instance_conditions(instances, minimum_number_of_instances): + if len(instances) == minimum_number_of_instances: + return True + if len(instances) > minimum_number_of_instances: + raise RuntimeError( + f"Launched too many instances somehow, raising and cleaning up - {instances}; min/max_allowed = {minimum_number_of_instances}" + ) + return False + + +class HeterogenousReservationError(Exception): + pass + + +def referesh_capacity_reservations(ec2_client, ec2_instance_type, az): + reservations = [ + reservation + for reservation in get_available_reservations(ec2_client, ec2_instance_type) + if reservation["AvailabilityZone"] == az + ] + + available_instances = sum( + [reservation["AvailableInstanceCount"] for reservation in reservations] + ) + + return reservations, available_instances + + +def launch_efa_with_heterogenous_reservations(ec2_client, ec2_run_instances_definition, fn_name=""): + """ + Launch efa instances with heterogenous reservations + + Previous EFA launch code requires instances to be launched from the same command. This prohibits launching instances + from multiple capacity reservations if the reservation has less than the minimum available instances required (typically 2). + + To remedy this, we group reservations by availability zone. If we have instances available in reservation, we + group by most common availability zone and try to launch multiple instances from reservation. 
If we do not meet our minimum + requirements, try launching from public pool to remedy the situation. If we launch 0 from reservation, do not + try launching from the public pool, and allow other functions to handle launching exclusively from public. + + Args: + ec2_client (boto3.client): boto3 ec2 client + ec2_run_instances_definition (dict): key/value pairs for run instances launch cmd + fn_name (str, optional): pytest function name. Defaults to "". + + Raises: + HeterogenousReservationError: Custom error handling for function failure + + Returns: + list: launched instances + """ + ec2_heterogenous_run_instances_definition = copy.deepcopy(ec2_run_instances_definition) + ec2_instance_type = ec2_heterogenous_run_instances_definition["InstanceType"] + minimum_number_of_instances = ec2_heterogenous_run_instances_definition["MinCount"] + + # Reset max and min count to 1; We will + ec2_heterogenous_run_instances_definition["MaxCount"] = 1 + ec2_heterogenous_run_instances_definition["MinCount"] = 1 + + reserved_azs = [ + reservation["AvailabilityZone"] + for reservation in ec2_client.describe_capacity_reservations()["CapacityReservations"] + if reservation["InstanceType"] == ec2_instance_type + ] + + tmp_reservations = get_available_reservations( + ec2_client=ec2_client, + instance_type=ec2_instance_type, + min_availability=ec2_heterogenous_run_instances_definition["MinCount"], + ) + + az_counter = Counter(reservation["AvailabilityZone"] for reservation in tmp_reservations) + az_priorities = [c[0] for c in az_counter.most_common()] + + # Track all reserved availability zones, in case capacity comes later + for reserved_az in reserved_azs: + if reserved_az not in az_priorities: + az_priorities.append(reserved_az) + + for az in az_priorities: + LOGGER.info(f"Checking AZ {az}") + # Refresh reservations for each AZ + reservations, available_instances = referesh_capacity_reservations( + ec2_client, ec2_instance_type, az + ) + ec2_heterogenous_run_instances_definition["MaxCount"] = 1 + ec2_heterogenous_run_instances_definition["MinCount"] = 1 + instances = [] + try: + while available_instances and len(instances) < minimum_number_of_instances: + LOGGER.info(f"trying to launch {ec2_instance_type} in {az}") + instance = launch_efa_with_reservations( + ec2_client=ec2_client, + ec2_instance_type=ec2_instance_type, + reservations=reservations, + ec2_run_instances_definition=ec2_heterogenous_run_instances_definition, + fn_name=fn_name, + ) + instances += instance + + # Refresh reservations for each AZ + reservations, available_instances = referesh_capacity_reservations( + ec2_client, ec2_instance_type, az + ) + + if validate_efa_instance_conditions(instances, minimum_number_of_instances): + LOGGER.info("Strung together some reservations, let's go") + return instances + + # If we have remaining instances, try launching from public pool + # Try a different availability zone if we don't have any reservation launches, however. Always + # prioritize reservation launches in this function. + remaining_instances = minimum_number_of_instances - len(instances) + if remaining_instances != minimum_number_of_instances: + LOGGER.info( + f"Have {remaining_instances} remaining_instances instances in {az}. Trying from public pool." 
) + ec2_heterogenous_run_instances_definition["MaxCount"] = remaining_instances + ec2_heterogenous_run_instances_definition["MinCount"] = remaining_instances + instances += launch_efa( + ec2_client, ec2_instance_type, ec2_heterogenous_run_instances_definition, az + ) + + if validate_efa_instance_conditions(instances, minimum_number_of_instances): + LOGGER.info("Strung together some reservations and some walk-ins, let's go") + return instances + + # Clean up instances if this workflow did not succeed + LOGGER.info( + f"Failed to launch enough instances from public and reservations for {fn_name}." + ) + if instances: + LOGGER.info( + f"Cleaning up instances {(instance['InstanceId'] for instance in instances)}..." + ) + ec2_client.terminate_instances( + InstanceIds=[instance_info["InstanceId"] for instance_info in instances] + ) + + except ClientError as e: + # Clean up any remaining instances + LOGGER.info( + f"Failed to launch EFA instance for {fn_name} from reservation due to {e}\n" + "Checking additional open reservations and cleaning up stray resources" + ) + if instances: + LOGGER.info( + f"Cleaning up instances {(instance['InstanceId'] for instance in instances)}..." + ) + ec2_client.terminate_instances( + InstanceIds=[instance_info["InstanceId"] for instance_info in instances] + ) + + except Exception as e: + if instances: + LOGGER.info( + f"Cleaning up instances {(instance['InstanceId'] for instance in instances)}..." + ) + ec2_client.terminate_instances( + InstanceIds=[instance_info["InstanceId"] for instance_info in instances] + ) + raise HeterogenousReservationError("Failed to launch via heterogenous approach") from e + return [] + + +@retry( + reraise=True, + stop=stop_after_delay(30 * 60), # Keep retrying for 30 minutes + wait=wait_random_exponential(min=60, max=5 * 60), # Retry after waiting 1-5 minutes +) +def launch_efa_instances_with_retry( + ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, + fn_name="", +): + """ + Helper function to launch EFA-capable EC2 instances with retry capability, to allow + multiple attempts when facing instance capacity issues. + :param ec2_client: boto3 EC2 Client object + :param ec2_instance_type: str EC2 Instance Type + :param availability_zone_options: list of availability zones in which to try to run instances + :param ec2_run_instances_definition: dict of parameters to pass to ec2_client.run_instances + :param fn_name: string - function name for ease of logging + :return: list of instance dicts launched via ec2_client.run_instances + """ + region = ec2_client.meta.region_name + LOGGER.info(f"Trying to launch {ec2_instance_type} for {fn_name} via capacity reservation...") + + heterogenous_reservation_launch = launch_efa_with_heterogenous_reservations( + ec2_client=ec2_client, + ec2_run_instances_definition=ec2_run_instances_definition, + fn_name=fn_name, + ) + + if heterogenous_reservation_launch: + return heterogenous_reservation_launch + + LOGGER.info( + f"Looks like you didn't have an EFA reservation for {fn_name}, let's see if we can seat you as a walk-in..." + ) + + instances = [] + for availability_zone in availability_zone_options: + try: + instances = launch_efa( + ec2_client, ec2_instance_type, ec2_run_instances_definition, availability_zone + ) + if instances: + break + except ClientError as e: + LOGGER.info( + f"Failed to launch in {availability_zone} for {fn_name} due to {e}\n" + "Retrying in the next availability zone." 
) + continue + if not instances: + raise RuntimeError( + f"Unable to launch {ec2_instance_type} instances in {region} for {fn_name}" + ) + return instances + + +def get_ec2_client(region): + return boto3.client("ec2", region_name=region, config=Config(retries={"max_attempts": 10})) + + +def get_instance_from_id(instance_id, region=DEFAULT_REGION): + """ + Get instance information using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Information about instance with matching instance ID + """ + if not instance_id: + raise Exception("No instance id provided") + client = boto3.Session(region_name=region).client("ec2") + instance = client.describe_instances(InstanceIds=[instance_id]) + if not instance: + raise Exception( + "Unable to describe the instance. \ + Did not return any reservations object" + ) + return instance["Reservations"][0]["Instances"][0] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_private_ip(instance_id, region=DEFAULT_REGION): + """ + Get Private IP of instance using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Private IP Address of instance with matching instance ID + """ + instance = get_instance_from_id(instance_id, region) + if not instance["PrivateIpAddress"]: + raise Exception("Private IP address not yet available") + return instance["PrivateIpAddress"] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_public_ip(instance_id, region=DEFAULT_REGION): + """ + Get Public IP of instance using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: IP Address of instance with matching instance ID + """ + instance = get_instance_from_id(instance_id, region) + if not instance["PublicIpAddress"]: + raise Exception("IP address not yet available") + return instance["PublicIpAddress"] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_public_ip_from_private_dns(private_dns, region=DEFAULT_REGION): + """ + Get Public IP of instance using private DNS + :param private_dns: Private DNS name of the instance to be queried + :param region: Region where query will be performed + :return: IP Address of instance with matching private DNS + """ + client = boto3.Session(region_name=region).client("ec2") + response = client.describe_instances( + Filters=[{"Name": "private-dns-name", "Values": [private_dns]}] + ) + return response.get("Reservations")[0].get("Instances")[0].get("PublicIpAddress") + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def get_instance_user(instance_id, region=DEFAULT_REGION): + """ + Get "ubuntu" or "ec2-user" based on AMI used to launch instance + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: user name + """ + instance = get_instance_from_id(instance_id, region) + user = "ubuntu" if instance["ImageId"] in UL_AMI_LIST else "ec2-user" + return user + + +def get_instance_state(instance_id, region=DEFAULT_REGION): + """ + Get state of instance using instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: State of instance with matching instance ID + """ + instance = get_instance_from_id(instance_id, region) + return instance["State"]["Name"] + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def check_instance_state(instance_id, state="running", region=DEFAULT_REGION): + """ + Compares the instance state with the 
state argument. + Retries 16 times with 60 seconds gap between retries. + :param instance_id: Instance ID to be queried + :param state: Expected instance state + :param region: Region where query will be performed + :return: State of instance with matching instance ID + """ + instance_state = get_instance_state(instance_id, region) + if state != instance_state: + raise Exception(f"Instance {instance_id} not in {state} state") + return instance_state + + +def get_system_state(instance_id, region=DEFAULT_REGION): + """ + Returns health checks state for instances + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: System state and Instance state of instance with matching instance ID + """ + if not instance_id: + raise Exception("No instance id provided") + client = boto3.Session(region_name=region).client("ec2") + response = client.describe_instance_status(InstanceIds=[instance_id]) + if not response: + raise Exception( + "Unable to describe instance status. \ + Did not return any response" + ) + instance_status_list = response["InstanceStatuses"] + if not instance_status_list: + raise Exception( + "Unable to describe instance status. \ + Did not return any instance statuses" + ) + if len(instance_status_list) < 1: + raise Exception( + "The instance id seems to be incorrect {}. \ + reservations seems to be empty".format( + instance_id + ) + ) + + instance_status = instance_status_list[0] + return ( + instance_status["SystemStatus"]["Status"], + instance_status["InstanceStatus"]["Status"], + ) + + +@retry(stop=stop_after_attempt(96), wait=wait_fixed(10)) +def check_system_state( + instance_id, system_status="ok", instance_status="ok", region=DEFAULT_REGION +): + """ + Compares the system state (Health Checks). + Retries 96 times with 10 seconds gap between retries + :param instance_id: Instance ID to be queried + :param system_status: Expected system state + :param instance_status: Expected instance state + :param region: Region where query will be performed + :return: System state and Instance state of instance with matching instance ID + """ + instance_state = get_system_state(instance_id, region=region) + if system_status != instance_state[0] or instance_status != instance_state[1]: + raise Exception( + "Instance {} not in \ + required state".format( + instance_id + ) + ) + return instance_state + + +def terminate_instance(instance_id, region=DEFAULT_REGION): + """ + Terminate EC2 instances with matching instance ID + :param instance_id: Instance ID to be terminated + :param region: Region where instance is located + """ + if not instance_id: + raise Exception("No instance id provided") + client = boto3.Session(region_name=region).client("ec2") + response = client.terminate_instances(InstanceIds=[instance_id]) + if not response: + raise Exception("Unable to terminate instance. No response received.") + instances_terminated = response["TerminatingInstances"] + if not instances_terminated: + raise Exception("Failed to terminate instance.") + if instances_terminated[0]["InstanceId"] != instance_id: + raise Exception("Failed to terminate instance. 
Unknown error.") + + +def get_instance_type_details(instance_type, region=DEFAULT_REGION): + """ + Get instance type details for a given instance type + :param instance_type: Instance type to be queried + :param region: Region where query will be performed + :return: Information about instance type + """ + client = boto3.client("ec2", region_name=region) + response = client.describe_instance_types(InstanceTypes=[instance_type]) + if not response or not response["InstanceTypes"]: + raise Exception("Unable to get instance details. No response received.") + if response["InstanceTypes"][0]["InstanceType"] != instance_type: + raise Exception( + f"Bad response received. Requested {instance_type} " + f"but got {response['InstanceTypes'][0]['InstanceType']}" + ) + return response["InstanceTypes"][0] + + +def get_instance_details(instance_id, region=DEFAULT_REGION): + """ + Get instance details for instance with given instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Information about instance with matching instance ID + """ + if not instance_id: + raise Exception("No instance id provided") + instance = get_instance_from_id(instance_id, region=region) + if not instance: + raise Exception("Could not find instance") + + return get_instance_type_details(instance["InstanceType"], region=region) + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_num_cpus(instance_id, region=DEFAULT_REGION): + """ + Get number of VCPUs on instance with given instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Number of VCPUs on instance with matching instance ID + """ + instance_info = get_instance_details(instance_id, region=region) + return instance_info["VCpuInfo"]["DefaultVCpus"] + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_memory(instance_id, region=DEFAULT_REGION): + """ + Get total RAM available on instance with given instance ID + :param instance_id: Instance ID to be queried + :param region: Region where query will be performed + :return: Total RAM available on instance with matching instance ID + """ + instance_info = get_instance_details(instance_id, region=region) + return instance_info["MemoryInfo"]["SizeInMiB"] + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_num_inferentias(instance_id=None, instance_type=None, region=DEFAULT_REGION): + """ + Get total number of neurons on instance with given instance ID + :param instance_id: Instance ID to be queried + :param instance_type: Instance Type to be queried + :param region: Region where query will be performed + :return: Number of neurons on instance with matching instance ID + """ + assert instance_id or instance_type, "Input must be either instance_id or instance_type" + instance_info = ( + get_instance_type_details(instance_type, region=region) + if instance_type + else get_instance_details(instance_id, region=region) + ) + return sum( + neuron_type["Count"] + for neuron_type in instance_info["InferenceAcceleratorInfo"]["Accelerators"] + if neuron_type["Name"] == "Inferentia" + ) + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_instance_num_gpus(instance_id=None, instance_type=None, region=DEFAULT_REGION): + """ + Get total number of GPUs on instance with given instance ID + :param instance_id: Instance ID to be queried + :param instance_type: Instance Type to be queried + :param region: Region 
where query will be performed + :return: Number of GPUs on instance with matching instance ID + """ + assert instance_id or instance_type, "Input must be either instance_id or instance_type" + instance_info = ( + get_instance_type_details(instance_type, region=region) + if instance_type + else get_instance_details(instance_id, region=region) + ) + return sum(gpu_type["Count"] for gpu_type in instance_info["GpuInfo"]["Gpus"]) + + +@retry(stop=stop_after_attempt(30), wait=wait_fixed(10)) +def get_num_efa_interfaces_for_instance_type(instance_type, region=DEFAULT_REGION): + """ + Get the maximum number of EFA interfaces available on a particular instance type + :param instance_type: str EC2 Instance type + :param region: str Region where ec2 instance must be launched + :return: NoneType/int Number of EFA interfaces that can be created on the given instance type. + Can be None if instance_type doesn't support EFA. + """ + instance_info = get_instance_type_details(instance_type, region) + num_efa_interfaces = ( + instance_info.get("NetworkInfo", {}).get("EfaInfo", {}).get("MaximumEfaInterfaces") + ) + return num_efa_interfaces + + +def get_ec2_fabric_connection(instance_id, instance_pem_file, region): + """ + establish connection with EC2 instance if necessary + :param instance_id: ec2_instance id + :param instance_pem_file: instance key name + :param region: Region where ec2 instance is launched + :return: Fabric connection object + """ + user = get_instance_user(instance_id, region=region) + conn = Connection( + user=user, + host=get_public_ip(instance_id, region), + connect_kwargs={"key_filename": [instance_pem_file]}, + connect_timeout=18000, + ) + return conn + + +def get_ec2_instance_tags(instance_id, region=DEFAULT_REGION, ec2_client=None): + ec2_client = ec2_client or get_ec2_client(region) + response = ec2_client.describe_tags(Filters=[{"Name": "resource-id", "Values": [instance_id]}]) + return {tag["Key"]: tag["Value"] for tag in response.get("Tags")} + + +# If IMDSv2 is enforced on EC2 instance with hop limit 1 then IMDSv2 api calls doesn't work +# If IMDSv2 is enforced on EC2 instance with hop limit > 1 then IMDSv2 api calls work +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def enforce_IMDSv2(instance_id, hop_limit, region=DEFAULT_REGION, ec2_client=None): + """ + Enable HTTP TOKENS required option on EC2 instance with given hop limit. + + :param instance_id: str, ec2 instance id + :param region: str, Region where ec2 instance is launched. + :param ec2_client: str, ec2 client. + :param hop_limit: str, hop limit to be set on ec2 instance. + """ + ec2_client = ec2_client or get_ec2_client(region) + response = ec2_client.modify_instance_metadata_options( + InstanceId=instance_id, + HttpTokens="required", + HttpPutResponseHopLimit=hop_limit, + HttpEndpoint="enabled", + ) + + if not response: + raise Exception("Unable to enforce IMDSv2. No response received.") + + time.sleep(2) + state = None + if response["InstanceId"]: + res = ec2_client.describe_instances(InstanceIds=[instance_id]) + if res: + metadata_options = res["Reservations"][0]["Instances"][0]["MetadataOptions"] + state = metadata_options["State"] + LOGGER.info(f"Modify Metadata options of EC2 instance: {metadata_options}") + if state != "applied": + raise Exception( + "Unable to enforce IMDSv2. Describe instance is not able to confirm if IMDSv2 enforced." 
) + + +@retry(stop=stop_after_attempt(16), wait=wait_fixed(60)) +def enforce_IMDSv1(instance_id, region=DEFAULT_REGION, ec2_client=None): + """ + Enable IMDSv1 on EC2 instance. + + :param instance_id: str, ec2 instance id + :param region: str, Region where ec2 instance is launched. + :param ec2_client: boto3 ec2 client. + """ + ec2_client = ec2_client or get_ec2_client(region) + response = ec2_client.modify_instance_metadata_options( + InstanceId=instance_id, HttpTokens="optional", HttpPutResponseHopLimit=1 + ) + + if not response: + raise Exception("Unable to enforce IMDSv1. No response received.") + time.sleep(2) + state = None + if response["InstanceId"]: + res = ec2_client.describe_instances(InstanceIds=[instance_id]) + if res: + metadata_options = res["Reservations"][0]["Instances"][0]["MetadataOptions"] + state = metadata_options["State"] + LOGGER.info(f"Modify Metadata options of EC2 instance: {metadata_options}") + if state != "applied": + raise Exception( + "Unable to enforce IMDSv1. Describe instance is not able to confirm if IMDSv1 enforced." + ) + + +def fetch_s3_file_and_get_last_line(s3_location, local_filename="temp.txt"): + """ + Fetches the s3 file locally and extracts its last line. + + :param s3_location: str, s3 uri + :param local_filename: str, location where s3 file is to be downloaded locally. + :return: str, The last line of the file + """ + run(f"rm -rf {local_filename}", hide=True) + run(f"aws s3 cp {s3_location} {local_filename}", hide=True) + last_line_of_file = run(f"tail -n1 {local_filename}", hide=True).stdout.strip() + return last_line_of_file + + +def execute_asynchronus_testing_using_s3_bucket( + connection, + execution_command, + connection_timeout, + required_log_ending, + loop_time=2.5 * 3600, + log_location_within_ec2="~/container_tests/logs.txt", + s3_uri_for_saving_permanent_logs=None, + hang_detection_window=3, +): + """ + This method uses fabric to run the provided execution_command in asynchronous mode. While the execution command + is being executed in the image, it keeps on uploading the logs to the s3 bucket at fixed intervals. After the + loop_time is over, it checks the last line of the uploaded logs to see if it is the same as required_log_ending. + This is mainly used in cases where Fabric behaves in an undesired way due to long-lived connections. + + :param connection: Fabric connection object + :param execution_command: str, command that connection.run() will execute + :param connection_timeout: timeout for fabric connection + :param required_log_ending: str, The string that is desired to be present at the end of the logs + :param loop_time: int, seconds for which we would wait for the tests to execute on ec2 instance + :param log_location_within_ec2: Location within ec2 instance where the logs are being written. + :param s3_uri_for_saving_permanent_logs: Location where permanent s3 logs could be saved. + :param hang_detection_window: int, This method detects a hang if the length of the log file does not change for hang_detection_window consecutive iterations. 
+ """ + account_id = os.getenv("ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"]) + s3_bucket_name = f"dlc-async-test-{account_id}" + if not s3_uri_for_saving_permanent_logs: + unique_id = str(uuid.uuid4()) + unique_id_with_timestamp = f"{unique_id}-{int(time.time())}" + s3_location = f"s3://{s3_bucket_name}/{unique_id_with_timestamp}.txt" + else: + s3_location = s3_uri_for_saving_permanent_logs + connection.run(execution_command, hide=True, timeout=connection_timeout, asynchronous=True) + start_time = int(time.time()) + loop_count = 0 + local_filename = s3_location.replace(":", "-").replace("/", "-") + last_line_of_log = "" + line_count_list = [] + while (int(time.time()) - start_time <= loop_time) and ( + not last_line_of_log.endswith(required_log_ending) + ): + time.sleep(5 * 60) + loop_count += 1 + connection.run( + f"aws s3 cp {log_location_within_ec2} {s3_location}", timeout=connection_timeout + ) + last_line_of_log = fetch_s3_file_and_get_last_line(s3_location, local_filename) + number_of_lines_in_log_file = int( + run(f"wc -l {local_filename}", hide=True).stdout.strip().split()[0] + ) + line_count_list.append(number_of_lines_in_log_file) + number_of_previous_line_counts_to_check = hang_detection_window + if len(line_count_list) >= number_of_previous_line_counts_to_check: + if all( + line_count == line_count_list[-1] + for line_count in line_count_list[-number_of_previous_line_counts_to_check:] + ): + # If last 3 runs lead to same line number then it demonstrates no progress and hence we stop. + LOGGER.info( + f"No progress reported for past {number_of_previous_line_counts_to_check} iterations. Job most likely hanged so stopping the execution!!" + ) + break + LOGGER.info(f"Uploaded file to {s3_location} for {loop_count} number of times") + + if not last_line_of_log.endswith(required_log_ending): + raise ValueError( + f""" Test failed because the last row is not as expected. \n""" + f""" Last row in the log file ===> {last_line_of_log} \n""" + f""" expected ===> {required_log_ending}. \n""" + f""" Full log ===> {s3_location} \n""" + ) + + +def get_s3_uri_for_saving_permanent_logs( + framework, s3_bucket, test_type="ec2", custom_filename=None +): + """ + Helper function to get s3 uri where log files generated within test ec2 instances will be uploaded to. + + :param framework: str, tensorflow, pytorch etc. + :param s3_bucket: str, name of the bucket where we want to upload the logs. 
+ :param test_type: str, type of the test + :param custom_filename: str, custom name of the file that will be prepended with unique id to create the s3 filepath + """ + commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", f"default-{int(time.time())}") + unique_id = str(uuid.uuid4()) + unique_id_with_timestamp = f"{unique_id}-{int(time.time())}" + if custom_filename: + filename = f"{custom_filename}-logs-{unique_id_with_timestamp}.txt" + else: + filename = f"logs-{unique_id_with_timestamp}.txt" + s3_filepath = os.path.join(s3_bucket, test_type, framework, commit_id, filename) + s3_permanent_log_upload_uri = f"s3://{s3_filepath}" + return s3_permanent_log_upload_uri + + +def execute_ec2_training_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + executable="bash", + large_shm=False, + host_network=False, + container_name="ec2_training_container", + timeout=18000, + bin_bash_entrypoint=False, + enable_habana_async_execution=False, + enable_gdrcopy=False, +): + if executable not in ("bash", "python"): + raise RuntimeError( + f"This function only supports executing bash or python commands on containers" + ) + if executable == "bash": + executable = os.path.join(os.sep, "bin", "bash") + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + synapseai_version = get_synapseai_version_from_tag(ecr_uri) + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + # Run training command + shm_setting = '--shm-size="1g"' if large_shm else "" + network = '--network="host" ' if host_network else "" + container_runtime = "--runtime=habana -e HABANA_VISIBLE_DEVICES=all" if "hpu" in ecr_uri else "" + ompi_mca_btl = "-e OMPI_MCA_btl_vader_single_copy_mechanism=none" if "hpu" in ecr_uri else "" + cap_add = "--cap-add=sys_nice" if "hpu" in ecr_uri else "" + ipc = "--ipc=host" if "hpu" in ecr_uri and "pytorch" in ecr_uri else "" + hpu_env_vars = f"-e GIT_BRANCH={synapseai_version}" if "hpu" in ecr_uri else "" + habana_container_test_repo = ( + "-v ${HOME}/gaudi-test-suite:/gaudi-test-suite" if "hpu" in ecr_uri else "" + ) + neuron_device = "--device=/dev/neuron0" if "neuron" in ecr_uri else "" + gdr_device = "--device=/dev/gdrdrv" if enable_gdrcopy else "" + bin_bash_cmd = "--entrypoint /bin/bash " if bin_bash_entrypoint else "" + + LOGGER.info(f"execute_ec2_training_test pulling {ecr_uri}, with cmd {test_cmd}") + connection.run(f"docker pull {ecr_uri}", hide="out") + connection.run( + f"docker run {docker_runtime} --name {container_name} " + f"{container_runtime} {ompi_mca_btl} {cap_add} {hpu_env_vars} " + f"{ipc} {network}-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} " + f"{habana_container_test_repo} {shm_setting} {neuron_device} {gdr_device} -itd {bin_bash_cmd}{ecr_uri}", + hide=True, + ) + + if "habana" in ecr_uri: + execution_command = f"docker exec --user root {container_name} {executable} -c '{test_cmd}'" + required_log_ending = "Kudos!! 
Habana tests executed successfully" + framework = ( + "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None + ) + test_type = "ec2" + account_id_prefix = os.getenv( + "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] + )[:3] + s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" + s3_uri_permanent_logs = get_s3_uri_for_saving_permanent_logs( + framework, s3_bucket=s3_bucket_for_permanent_logs, test_type=test_type + ) + if enable_habana_async_execution == True: + execute_asynchronus_testing_using_s3_bucket( + connection, + execution_command, + timeout, + required_log_ending, + loop_time=4 * 3600, + s3_uri_for_saving_permanent_logs=s3_uri_permanent_logs, + hang_detection_window=15, + ) + return + else: + run_output = connection.run(execution_command, hide=True, timeout=timeout) + try: + connection.run(f"aws s3 cp ~/container_tests/logs.txt {s3_uri_permanent_logs}") + LOGGER.info(f"Uploaded logs at: {s3_uri_permanent_logs}") + except: + LOGGER.info(f"Could not upload the logs") + return run_output + + # Hack not sure why but see the following. since not using latest driver yet in the AMI, doing this for now + # [ 214.939271] Neuron Driver Started with Version:2.x.381.0-b70a76a18efb5e89ffed987461e9a1009d8b6f1e + # [ 214.939619] neuron-driver 0000:00:1e.0: BAR 4: can't reserve [mem 0x1000000000-0x17ffffffff 64bit pref] + if "neuron" in ecr_uri: + connection.run(f"sudo modprobe -r neuron && sudo modprobe -i neuron") + + LOGGER.info(f"execute_ec2_training_test running {ecr_uri}, with cmd {test_cmd}") + ec2_res = connection.run( + f"docker exec --user root {container_name} {executable} -c '{test_cmd}'", + hide=True, + timeout=timeout, + ) + LOGGER.info(f"execute_ec2_training_test completed {ecr_uri}, with cmd {test_cmd}") + return ec2_res + + +def execute_ec2_telemetry_test( + connection, + ecr_uri, + call_type, + container_name, + test_cmd, + opt_in=False, + region=DEFAULT_REGION, + timeout=900, +): + """ + Execute telemetry tests on EC2 instances using Docker containers. + + Args: + connection: EC2 connection object + ecr_uri (str): ECR image URI + call_type (str): Type of test to run ('bashrc', 'entrypoint', 'framework') + container_name (str): Base name for the container + test_cmd (str): Test command to execute + opt_in (bool): Whether to run in opt-in mode (default: False) + region (str): AWS region + timeout (int): Timeout in seconds (default: 900) + + Returns: + Result object from the connection.run command + + Raises: + RuntimeError: If invalid call_type is provided + """ + # Validate call type + VALID_CALL_TYPES = {"bashrc", "entrypoint", "framework"} + if call_type not in VALID_CALL_TYPES: + raise RuntimeError(f"Invalid call_type. 
Must be one of: {', '.join(VALID_CALL_TYPES)}") + + # Set up Docker runtime configuration + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + if "pytorch" in ecr_uri: + framework_env = f"-e FRAMEWORK='torch'" + elif "tensorflow" in ecr_uri: + framework_env = f"-e FRAMEWORK='tensorflow'" + else: + framework_env = "" + opt_out_env = "" if opt_in else "-e OPT_OUT_TRACKING='true'" + + # Set up container and mount configuration + test_suffix = "opt_in" if opt_in else "opt_out" + container_name = ( + f"{container_name}_{call_type}_{test_suffix}" + if call_type in {"bashrc", "entrypoint"} + else f"{container_name}_{call_type}" + ) + + container_test_local_dir = os.path.join("$HOME", "container_tests") + mount_path = f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')}" + + # Prepare test command + test_cmd = f"{test_cmd} {call_type} {test_suffix}" + LOGGER.info(f"Executing test: {test_cmd}") + + # for entrypoint test, we aviod invoking bashrc telemetry + nobashrc_cmd = f"bash --norc" if call_type == "entrypoint" else "" + + # for other tests, we need to aviod using entrypoint telemetry + entrypoint_override = f"--entrypoint /bin/bash" if call_type != "entrypoint" else "" + + try: + # Login to ECR and pull image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + LOGGER.info(f"Pulling image: {ecr_uri}") + connection.run(f"docker pull {ecr_uri}", hide="out") + + # Execute test based on call type + # Start container + connection.run( + f"docker run {docker_runtime} --name {container_name} " + f" {mount_path} " + f"-itd -e TEST_MODE='1' {framework_env} {opt_out_env} {entrypoint_override} {ecr_uri} {nobashrc_cmd}", + hide=True, + ) + + # Execute test command + ec2_res = connection.run( + f"docker exec --user root {container_name} bash -c '{test_cmd}'", + hide=True, + timeout=timeout, + ) + + LOGGER.info(f"Test completed for {call_type} on {ecr_uri}") + return ec2_res + + except Exception as e: + LOGGER.error(f"Test failed: {str(e)}") + raise + + +def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REGION): + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + # Run training command + connection.run( + f"docker run {docker_runtime} --name ec2_inference_container -v {container_test_local_dir}:{os.path.join(os.sep, 'test')}" + f" -itd {ecr_uri} bash", + hide=True, + ) + connection.run( + f"docker exec --user root ec2_inference_container {os.path.join(os.sep, 'bin', 'bash')} -c '{test_cmd}'", + hide=True, + timeout=3000, + ) + + +def execute_ec2_training_performance_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + post_process=None, + data_source="", + threshold=None, +): + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + + timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") + log_name = ( + f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" + ) + log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name) + + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + 
login_to_ecr_registry(connection, account_id, region) + + connection.run(f"docker pull {ecr_uri}", hide=True) + + # Run training command, display benchmark results to console + connection.run( + f"docker run {docker_runtime} --user root " + f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} " + f"-e PR_CONTEXT={1 if is_pr_context() else 0} " + f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {ecr_uri} " + f"{os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}" + ) + ec2_performance_upload_result_to_s3_and_validate( + connection, + ecr_uri, + log_location, + data_source, + threshold, + post_process, + log_name, + ) + + +def execute_ec2_habana_training_performance_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + data_source="", + cards_num=None, + timeout=18000, +): + container_test_local_dir = os.path.join("$HOME", "container_tests") + + timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") + log_name = ( + f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" + ) + synapseai_version = get_synapseai_version_from_tag(ecr_uri) + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + + connection.run(f"docker pull -q {ecr_uri}") + + container_runtime = "--runtime=habana -e HABANA_VISIBLE_DEVICES=all" + hpu_env_vars = f"-e CARDS_NUM={cards_num} -e GIT_BRANCH={synapseai_version}" + ompi_mca_btl = "-e OMPI_MCA_btl_vader_single_copy_mechanism=none" + cap_add = "--cap-add=sys_nice" + ipc = "--ipc=host" if "pytorch" in ecr_uri else "" + habana_container_test_repo = "${HOME}/gaudi-test-suite:/gaudi-test-suite" + execution_command = ( + f"docker run --user root " + f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} " + f"-e PR_CONTEXT={1 if is_pr_context() else 0} " + f"{container_runtime} {ompi_mca_btl} {hpu_env_vars} {cap_add} {ipc} " + f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} -v {habana_container_test_repo} " + f"{ecr_uri} {os.path.join(os.sep, 'bin', 'bash')} -c '{test_cmd}'" + ) + + framework = ( + "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None + ) + account_id_prefix = os.getenv( + "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] + )[:3] + s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" + test_type = "benchmark" + custom_filename = test_cmd.split(f"{os.sep}")[-1] + custom_filename += f"-cards-{cards_num}" if cards_num else "-cards-0" + s3_uri_permanent_logs = get_s3_uri_for_saving_permanent_logs( + framework, + s3_bucket=s3_bucket_for_permanent_logs, + test_type=test_type, + custom_filename=custom_filename, + ) + required_log_ending = "Kudos!! 
Habana tests executed successfully" + execute_asynchronus_testing_using_s3_bucket( + connection, + execution_command, + timeout, + required_log_ending, + loop_time=4 * 3600, + s3_uri_for_saving_permanent_logs=s3_uri_permanent_logs, + hang_detection_window=15, + ) + LOGGER.info(f"Uploaded logs at: {s3_uri_permanent_logs}") + return + + +def execute_ec2_inference_performance_test( + connection, + ecr_uri, + test_cmd, + region=DEFAULT_REGION, + post_process=None, + data_source="", + threshold=None, +): + docker_runtime = "--runtime=nvidia --gpus all" if "gpu" in ecr_uri else "" + container_test_local_dir = os.path.join("$HOME", "container_tests") + timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") + log_name = ( + f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" + ) + # Make sure we are logged into ECR so we can pull the image + account_id = get_account_id_from_image_uri(ecr_uri) + login_to_ecr_registry(connection, account_id, region) + connection.run(f"docker pull -q {ecr_uri}") + + # Run training command, display benchmark results to console + repo_name, image_tag = ecr_uri.split("/")[-1].split(":") + container_name = f"{repo_name}-performance-{image_tag}-ec2" + connection.run( + f"docker run {docker_runtime} -d --name {container_name} " + f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} " + f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {ecr_uri}" + ) + try: + connection.run( + f"docker exec --user root {container_name} " + f"{os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}" + ) + except Exception as e: + raise Exception("Failed to exec benchmark command.\n", e) + finally: + connection.run(f"docker rm -f {container_name}") + log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name) + ec2_performance_upload_result_to_s3_and_validate( + connection, + ecr_uri, + log_location, + data_source, + threshold, + post_process, + log_name, + ) + + +def ec2_performance_upload_result_to_s3_and_validate( + connection, + ecr_uri, + log_location, + data_source, + threshold, + post_process, + log_name, + instance_type=None, +): + framework = ( + "tensorflow" if "tensorflow" in ecr_uri else "mxnet" if "mxnet" in ecr_uri else "pytorch" + ) + framework_version = re.search(r"\d+(\.\d+){2}", ecr_uri).group() + py_version = "py2" if "py2" in ecr_uri else "py37" if "py37" in ecr_uri else "py3" + processor = "gpu" if "gpu" in ecr_uri else "cpu" + work_type = "training" if "training" in ecr_uri else "inference" + s3_location = os.path.join( + BENCHMARK_RESULTS_S3_BUCKET, + framework, + framework_version, + "ec2", + work_type, + processor, + py_version, + log_name, + ) + params = {"connection": connection, "log_location": log_location} + if "threshold" in signature(post_process).parameters: + params["threshold"] = threshold + performance_number = post_process(**params) + unit = ( + "s" + if work_type == "inference" and framework == "tensorflow" + else ( + "ms" + if work_type == "inference" and framework == "pytorch" + else ( + "s/epoch" + if work_type == "training" and framework == "pytorch" and data_source == "imagenet" + else "images/sec" + ) + ) + ) + description = "p99 latency " if unit == "s" or unit == "ms" else "" + for k, v in performance_number.items(): + performance_statement = ( + f"{framework} {framework_version} ec2 {work_type} {processor} {py_version} " + f"{instance_type if instance_type else ''} {data_source} {k} {description}: {v} {unit}, threshold: {threshold[k]} {unit}" + ) + 
connection.run(f"echo {performance_statement} | sudo tee -a {log_location}") + LOGGER.info(f"{performance_statement}") + connection.run(f"aws s3 cp {log_location} {s3_location}") + LOGGER.info(f"To retrieve complete benchmark log, check {s3_location}") + + def _assertion_results(): + if "Cost" in performance_number: + return performance_number["Cost"] < threshold["Cost"] + if "Throughput" in performance_number: + return performance_number["Throughput"] > threshold["Throughput"] + if len(performance_number) == 0: + return False + failure_count = 0 + for k, v in performance_number.items(): + if v > threshold[k]: + failure_count += 1 + return failure_count <= 2 + + for _ in performance_number: + assert _assertion_results(), ( + f"{framework} {framework_version} ec2 {work_type} {processor} {py_version} {data_source} " + f"Benchmark Result {performance_number} does not reach the threshold {threshold}" + ) + + +def post_process_inference(connection, log_location, threshold): + log_content = connection.run(f"cat {log_location}").stdout.split("\n") + performance_number = {} + for line in log_content: + if "p99" in line: + for key in threshold.keys(): + if key in line: + performance_number[key] = float( + re.search( + r"(p99[ ]*(Latency)?[ ]*:[ ]*)(?P[0-9]+\.?[0-9]+)", + line, + ).group("result") + ) + break + return performance_number + + +def post_process_mxnet_ec2_performance(connection, log_location): + log_content = connection.run(f"cat {log_location}").stdout.split("\n") + total = 0.0 + n = 0 + for line in log_content: + if "samples/sec" in line and "warmup" not in line: + throughput = re.search(r"((?P[0-9]+\.?[0-9]+)[ ]+samples/sec)", line).group( + "throughput" + ) + total += float(throughput) + n += 1 + if total and n: + return {"Throughput": total / n} + else: + raise ValueError("total: {}; n: {} -- something went wrong".format(total, n)) + + +def install_python_in_instance(context, python_version="3.9"): + """ + Install python on DLAMI EC2 instances to create a consistent test environment that is agnostic to AMI used for test. + This helper function assumes that the EC2 instance uses a DLAMI. The /etc/profile.d/dlami.sh file doesn't exist + in other AMIs. If support for other AMIs is needed, this function will need to be updated. + :param context: Invoke Context / Fabric Connection object + :param python_version: str python version to install, such as 3.8, 3.9, etc. + :return: None + """ + if context.run("pyenv --version", warn=True, hide=True).failed: + context.run( + """ls ~/.pyenv || git clone https://github.com/pyenv/pyenv.git ~/.pyenv""", hide=True + ) + + # for images that do not have /etc/profile.d/dlami.sh, we will make it here + if context.run("test -f /etc/profile.d/dlami.sh", warn=True, hide=True).failed: + LOGGER.info("/etc/profile.d/dlami.sh does not exist. 
Making...") + context.run("sudo touch /etc/profile.d/dlami.sh") + LOGGER.info("adding /etc/profile.d/dlami.sh to .bashrc") + context.run( + """echo '[ -z "$PS1" ] && source /etc/profile.d/dlami.sh'|cat - ~/.bashrc > ~/temprc """ + """&& mv ~/temprc ~/.bashrc""", + hide=True, + ) + + context.run("sudo chmod 666 /etc/profile.d/dlami.sh", hide=True) + context.run( + """echo 'export PYENV_ROOT="$HOME/.pyenv"' >> /etc/profile.d/dlami.sh""", hide=True + ) + context.run( + """echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> /etc/profile.d/dlami.sh""", + hide=True, + ) + context.run("""echo 'eval "$(pyenv init -)"' >> /etc/profile.d/dlami.sh""", hide=True) + context.run("sudo chmod 644 /etc/profile.d/dlami.sh", hide=True) + context.run("sudo dnf update -y", hide=True) + context.run( + ( + "sudo dnf install -y make gcc gcc-c++ openssl-devel zlib-devel " + "bzip2-devel readline-devel sqlite-devel llvm " + "ncurses-devel xz tk-devel libxml2-devel xmlsec1-devel libffi-devel xz-devel --skip-broken" + ), + hide=True, + ) + + context.run(f"pyenv install {python_version}", hide=True) + context.run(f"pyenv global {python_version}", hide=True) + + # Validate that installed python version is the same as requested python version + python_version_response = context.run("python --version", hide=True) + python_version_match = re.search(r"Python (\d+(\.\d+)+)", python_version_response.stdout) + assert python_version_match, "Running 'python --version' returned None" + installed_python_version = python_version_match.group(1) + # Use SpecifierSet("=={python_version}.*") to accommodate python_version of the form X.Y as well as X.Y.Z + assert Version(installed_python_version) in SpecifierSet( + f"=={python_version}.*" + ), f"Installed python version {installed_python_version} does not match required python_version {python_version}" + + +def get_availability_zone_ids(ec2_client): + """ + Obtain list of AZs in a particular region using ec2_client + :param ec2_client: boto3 EC2 Client object + :return: list of str AZ names + """ + response = ec2_client.describe_availability_zones() + return [az["ZoneName"] for az in response["AvailabilityZones"]] + + +def get_default_vpc_id(ec2_client): + """ + Get vpd-id of default VPC in a particular region using ec2_client in that region + :param ec2_client: boto3 EC2 Client object + :return: str Default vpc-id + """ + response = ec2_client.describe_vpcs(Filters=[{"Name": "is-default", "Values": ["true"]}]) + default_vpc_id = response["Vpcs"][0]["VpcId"] + return default_vpc_id + + +def get_default_security_group_id(ec2_client): + """ + Get security-group-id of default SG on the default VPC in a particular region using ec2_client + :param ec2_client: boto3 EC2 Client object + :return: str Default security-group-id + """ + default_vpc_id = get_default_vpc_id(ec2_client) + response = ec2_client.describe_security_groups( + GroupNames=["default"], + Filters=[{"Name": "vpc-id", "Values": [default_vpc_id]}], + ) + default_security_group_id = response["SecurityGroups"][0]["GroupId"] + return default_security_group_id + + +def get_efa_enabled_security_group_id(ec2_client): + """ + Get security-group-id of custom EFA-enabled SG in the default VPC in a particular region + :param ec2_client: boto3 EC2 Client object + :return: str security-group-id of SG named "EFA-enabled" + """ + default_vpc_id = get_default_vpc_id(ec2_client) + response = ec2_client.describe_security_groups( + GroupNames=["EFA-enabled"], + Filters=[{"Name": "vpc-id", "Values": [default_vpc_id]}], + 
) + + efa_security_group_id = response["SecurityGroups"][0]["GroupId"] + return efa_security_group_id + + +def get_default_subnet_for_az(ec2_client, availability_zone): + """ + Get subnet-id associated with a particular AZ using ec2_client for that region + :param ec2_client: boto3 EC2 Client object + :param availability_zone: str Availability Zone name + :return: str subnet-id + """ + response = ec2_client.describe_subnets( + Filters=[ + {"Name": "availability-zone", "Values": [availability_zone]}, + {"Name": "default-for-az", "Values": ["true"]}, + ] + ) + az_subnet_id = response["Subnets"][0]["SubnetId"] + return az_subnet_id + + +def get_subnet_id_by_vpc(ec2_client, vpc_id): + + response = ec2_client.describe_subnets( + Filters=[ + { + "Name": "vpc-id", + "Values": [ + vpc_id, + ], + }, + ], + ) + + subnet_ids = [] + for subnet in response["Subnets"]: + if subnet["SubnetId"] is not None: + subnet_ids.append(subnet["SubnetId"]) + + return subnet_ids + + +def get_vpc_id_by_name(ec2_client, vpc_name): + """ + Get VPC ID by VPC name tag + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str VPC ID of the VPC name + """ + response = ec2_client.describe_vpcs(Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]).get( + "Vpcs", [] + ) + + if not response: + raise Exception(f"No VPC found with Name tag: {vpc_name}") + elif len(response) > 1: + raise Exception(f"Multiple VPCs found with Name tag: {vpc_name}") + + vpc_id = response[0]["VpcId"] + + return vpc_id + + +def get_default_security_group_id_by_vpc_id(ec2_client, vpc_name): + """ + Get default SG ID for a non-default VPC + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the default SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": ["default"]}, + ], + ) + + security_group_id = response["SecurityGroups"][0]["GroupId"] + return security_group_id + except Exception as e: + LOGGER.error(f"Error in get_default_security_group_id_by_vpc_id: {str(e)}") + raise + + +def get_ipv6_efa_enabled_security_group_id(ec2_client, vpc_name): + """ + Get EFA-enabled SG ID for IPv6 VPC by identifying security groups that allow + all traffic within themselves + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :return: str SG ID of the EFA-enabled SG + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + response = ec2_client.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + ] + ) + + for sg in response["SecurityGroups"]: + inbound_all_traffic = any( + rule["IpProtocol"] == "-1" + and any( + pair["GroupId"] == sg["GroupId"] for pair in rule.get("UserIdGroupPairs", []) + ) + for rule in sg["IpPermissions"] + ) + + outbound_all_traffic = any( + rule["IpProtocol"] == "-1" + and any( + pair["GroupId"] == sg["GroupId"] for pair in rule.get("UserIdGroupPairs", []) + ) + for rule in sg["IpPermissionsEgress"] + ) + + if inbound_all_traffic and outbound_all_traffic: + return sg["GroupId"] + + raise ValueError( + f"No EFA-enabled security group found in VPC {vpc_name}. Expected a sg that allows all traffic to and from itself." 
+ ) + except Exception as e: + LOGGER.error(f"Error when getting IPv6 EFA-enabled sg id: {str(e)}") + raise + + +def get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone): + """ + Get IPv6-enabled subnet ID in the a particular availability zone + :param ec2_client: boto3 EC2 Client object + :param vpc_name: Name tag value of the VPC + :param availability_zone: str AZ name + :return: str Subnet ID of an IPv6-enabled subnet + """ + try: + vpc_id = get_vpc_id_by_name(ec2_client, vpc_name) + + route_tables = ec2_client.describe_route_tables( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + )["RouteTables"] + + response = ec2_client.describe_subnets( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "availability-zone", "Values": [availability_zone]}, + ] + ) + + ipv6_subnets = [ + subnet + for subnet in response["Subnets"] + if subnet.get("Ipv6CidrBlockAssociationSet") + and is_public_subnet(subnet["SubnetId"], route_tables) + ] + + if not ipv6_subnets: + raise Exception( + f"No IPv6-enabled subnet found in AZ {availability_zone} for VPC {vpc_id}" + ) + + return ipv6_subnets[0]["SubnetId"] + except Exception as e: + LOGGER.error( + f"Error in when getting IPv6 enabled subnet for AZ {availability_zone}: {str(e)}" + ) + raise + + +def is_public_subnet(subnet_id, route_tables): + """ + Check if a subnet is public by verifying if it has a route table with an Internet Gateway + that routes all IPv4 or IPv6 traffic + :param subnet_id: str the subnet ID to check + :param route_tables: list route tables from the VPC + :return: True if subnet is public, False otherwise + """ + for route_table in route_tables: + has_igw = False + for route in route_table.get("Routes", []): + if route.get("GatewayId", "").startswith("igw-"): + if ( + route.get("DestinationCidrBlock") == "0.0.0.0/0" + or route.get("DestinationIpv6CidrBlock") == "::/0" + ): + has_igw = True + break + if not has_igw: + continue + + # check if subnet is associated with route table + for association in route_table.get("Associations", []): + if association.get("SubnetId") == subnet_id: + return True + + return False + + +def generate_standard_dual_stack_network_interface(ec2_client, availability_zone): + """ + Generate network interface configuration for dual-stack (IPv4/IPv6) instances. + :param ec2_client: boto3 EC2 Client + :param availability_zone: str AZ in which the instance must be created + :return: list containing a single network interface configuration for dual-stack + """ + try: + if not IPV6_VPC_NAME: + raise ValueError("IPv6 VPC name is not set") + + ipv6_default_sg = get_default_security_group_id_by_vpc_id(ec2_client, IPV6_VPC_NAME) + ipv6_subnet_id = get_ipv6_enabled_subnet_for_az( + ec2_client, IPV6_VPC_NAME, availability_zone + ) + + network_interfaces = [ + { + "DeviceIndex": 0, + "DeleteOnTermination": True, + "Groups": [ipv6_default_sg], + "SubnetId": ipv6_subnet_id, + "Ipv6AddressCount": 1, + } + ] + + return network_interfaces + + except Exception as e: + LOGGER.error( + f"Failed to generate dual-stack network interface in AZ {availability_zone}: {str(e)}" + ) + raise + + +def generate_network_interfaces(ec2_client, ec2_instance_type, availability_zone): + """ + Generate list of EFA-network-interfaces based on the number of network-interfaces available + on a given instance type. 
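+    Each entry in the returned list is shaped like the following (illustrative placeholder IDs):
+
+        {"DeviceIndex": 0, "NetworkCardIndex": 0, "DeleteOnTermination": True,
+         "InterfaceType": "efa", "Groups": ["sg-..."], "SubnetId": "subnet-..."}
+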
+ :param ec2_client: boto3 EC2 Client + :param ec2_instance_type: str EC2 Instance Type with network interface to be configured + :param availability_zone: str AZ in which the instance must be created + :return: list of dicts mapping each network-interface available + """ + num_efa_interfaces = get_num_efa_interfaces_for_instance_type(ec2_instance_type) + if not num_efa_interfaces: + raise AttributeError(f"Unable to get number of EFA Interfaces for {ec2_instance_type}") + + if ENABLE_IPV6_TESTING: + vpc_name = IPV6_VPC_NAME + efa_sg = get_ipv6_efa_enabled_security_group_id(ec2_client, vpc_name) + sg_ids = [efa_sg] + subnet_id = get_ipv6_enabled_subnet_for_az(ec2_client, vpc_name, availability_zone) + else: + default_sg = get_default_security_group_id(ec2_client) + efa_sg = get_efa_enabled_security_group_id(ec2_client) + sg_ids = [default_sg, efa_sg] + subnet_id = get_default_subnet_for_az(ec2_client, availability_zone) + + network_interfaces = [] + for i in range(num_efa_interfaces): + interface = { + "DeviceIndex": 0 if i == 0 else 1, + "NetworkCardIndex": i, + "DeleteOnTermination": True, + "InterfaceType": "efa", + "Groups": sg_ids, + "SubnetId": subnet_id, + } + + network_interfaces.append(interface) + + return network_interfaces + + +def get_network_interface_id(instance_id, region=DEFAULT_REGION): + """ + Gets the network interface at index 0 from the instance_id. Meant to be used + with p4d instance with 4 efa devices + """ + instance = get_instance_from_id(instance_id, region) + network_interfaces_info = instance["NetworkInterfaces"] + for device in network_interfaces_info: + if device["Attachment"]["DeviceIndex"] == 0: + return device["NetworkInterfaceId"] + + raise Exception("Could not find network device 0, retry operation") + + +def get_ipv6_address_for_eth0(instance_id, region=DEFAULT_REGION): + """ + Gets the IPv6 address specifically from eth0 (Device Index 0) of an EC2 instance + """ + instance = get_instance_from_id(instance_id, region) + network_interfaces_info = instance["NetworkInterfaces"] + for device in network_interfaces_info: + if device["Attachment"]["DeviceIndex"] == 0: + if device["Ipv6Addresses"]: + return device["Ipv6Addresses"][0]["Ipv6Address"] + LOGGER.info(f"No IPv6 address found on eth0 for instance {instance_id}") + return None + + LOGGER.error(f"Could not find eth0 for instance {instance_id}") + return None + + +def attach_elastic_ip(network_interface_id, region="us-east-1", is_ipv6=False): + """ + Creates and attaches an elastic ip to a network interface which is already + attached to an efa enabled device. This is needed specifically for 4 efa devices + attached to a p4d instance. Having multiple network devices prevents automatic + public ip address assignment, so we must do it manually. + """ + ec2_client = boto3.client("ec2", region_name=region) + arguments_dict = { + "Domain": "vpc", + "TagSpecifications": [ + { + "ResourceType": "elastic-ip", + "Tags": [{"Key": "Name", "Value": f"elastic_ip_{network_interface_id}"}], + } + ], + } + elastic_ip = ec2_client.allocate_address(**arguments_dict) + elastic_ip_allocation_id = elastic_ip["AllocationId"] + response = ec2_client.associate_address( + AllocationId=elastic_ip_allocation_id, NetworkInterfaceId=network_interface_id + ) + if is_ipv6: + ec2_client.assign_ipv6_addresses( + NetworkInterfaceId=network_interface_id, Ipv6AddressCount=1 + ) + return elastic_ip_allocation_id + + +def delete_elastic_ips(elastic_ip_allocation_ids, ec2_client): + """ + Deletes elastic ips created for efa p4d testing. 
+ For default VPC (IPv4): can release directly + For non-default VPC (IPv6): need to disassociate before release + """ + for allocation_id in elastic_ip_allocation_ids: + try: + if ENABLE_IPV6_TESTING: + address = ec2_client.describe_addresses(AllocationIds=[allocation_id])["Addresses"][ + 0 + ] + if "AssociationId" in address: + ec2_client.disassociate_address(AssociationId=address["AssociationId"]) + time.sleep(10) + ec2_client.release_address(AllocationId=allocation_id) + except Exception as e: + LOGGER.error(f"Failed to delete elastic ip {allocation_id}: {str(e)}") + + +def create_name_tags_for_instance(instance_id, name_tag, region): + """ + Create name tags for an instance + :param instance_id: str Instance ID on which to apply the given Name tag + :param name_tag: str Name tag to be applied + :param region: str Region in which instance is running + """ + ec2_client = boto3.client("ec2", region_name=region) + response = ec2_client.create_tags( + Resources=[instance_id], + Tags=[{"Key": "Name", "Value": name_tag}], + ) + if not response: + raise Exception( + "Unable to create name tag {0} for the instance {1}".format(name_tag, instance_id) + ) + + +def get_efa_devices_on_instance(connection): + """ + Get list of EFA devices available for use in an instance + :param connection: Fabric Connection object + :return: list of str device paths + """ + response = connection.run("ls /dev/infiniband/uverbs*") + devices = response.stdout.split() + return devices From 2213423d3d86688444c5d4f12b2fa3e4d78cd67d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:30:40 -0700 Subject: [PATCH 03/33] add vllm specific ec2 infra changes --- infra/test_infra/ec2/vllm/__init__.py | 0 infra/test_infra/ec2/vllm/fsx_utils.py | 302 +++++++++ infra/test_infra/ec2/vllm/setup_ec2.py | 645 ++++++++++++++++++++ infra/test_infra/ec2/vllm/setup_fsx_vllm.sh | 75 +++ 4 files changed, 1022 insertions(+) create mode 100644 infra/test_infra/ec2/vllm/__init__.py create mode 100644 infra/test_infra/ec2/vllm/fsx_utils.py create mode 100644 infra/test_infra/ec2/vllm/setup_ec2.py create mode 100644 infra/test_infra/ec2/vllm/setup_fsx_vllm.sh diff --git a/infra/test_infra/ec2/vllm/__init__.py b/infra/test_infra/ec2/vllm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/infra/test_infra/ec2/vllm/fsx_utils.py b/infra/test_infra/ec2/vllm/fsx_utils.py new file mode 100644 index 000000000000..ce27840fff90 --- /dev/null +++ b/infra/test_infra/ec2/vllm/fsx_utils.py @@ -0,0 +1,302 @@ +import time +from invoke import run +from typing import Dict, List, Any +import boto3 +from botocore.exceptions import ClientError + +from infra.test_infra.test_infra_utils import create_logger + +LOGGER = create_logger(__name__) + + +class FsxSetup: + """ + A utility class for setting up and managing FSx for Lustre filesystems + and related AWS and Kubernetes resources. 
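+
+    Example (illustrative resource IDs and values):
+        fsx = FsxSetup(region="us-west-2")
+        fs_info = fsx.create_fsx_filesystem(
+            subnet_id="subnet-0123456789abcdef0",
+            security_group_ids=["sg-0123456789abcdef0"],
+            storage_capacity=1200,
+            deployment_type="SCRATCH_2",
+            tags={"Name": "fsx-lustre-vllm-ec2-test"},
+        )
+        # fs_info is a dict with filesystem_id, dns_name, and mount_name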
+
+    : param region: AWS region where resources will be created (default: "us-west-2")
+    """
+
+    def __init__(self, region: str = "us-west-2"):
+        self.region = region
+
+    def create_fsx_filesystem(
+        self,
+        subnet_id: str,
+        security_group_ids: List[str],
+        storage_capacity: int,
+        deployment_type: str,
+        tags: Dict[str, str],
+    ):
+        """
+        Create FSx filesystem with given configuration
+        : param subnet_id: subnet ID where FSx will be created
+        : param security_group_ids: list of security group IDs
+        : param storage_capacity: storage capacity in GiB
+        : param deployment_type: FSx deployment type
+        : param tags: dictionary of tags to apply to the FSx filesystem
+        : return: dictionary containing filesystem details
+        """
+        tags_param = " ".join([f"Key={k},Value={v}" for k, v in tags.items()])
+
+        try:
+            fsx_id = run(
+                f"aws fsx create-file-system"
+                f" --file-system-type LUSTRE"
+                f" --storage-capacity {storage_capacity}"
+                f" --subnet-ids {subnet_id}"
+                f' --security-group-ids {" ".join(security_group_ids)}'
+                f" --lustre-configuration DeploymentType={deployment_type}"
+                f" --tags {tags_param}"
+                f" --file-system-type-version 2.15"
+                f' --query "FileSystem.FileSystemId"'
+                f" --output text"
+            ).stdout.strip()
+
+            LOGGER.info(f"Created FSx filesystem: {fsx_id}")
+
+            filesystem_info = self.wait_for_filesystem(fsx_id)
+            return filesystem_info
+
+        except Exception as e:
+            LOGGER.error(f"Failed to create FSx filesystem: {e}")
+            raise
+
+    def delete_fsx_filesystem(self, fsx_id: str):
+        """
+        Delete the FSx filesystem with the given filesystem ID
+        : param fsx_id: FSx filesystem ID to delete
+        """
+        try:
+            fsx_id = run(
+                f"aws fsx delete-file-system"
+                f" --file-system-id {fsx_id}"
+                f' --query "FileSystem.FileSystemId"'
+                f" --output text"
+            ).stdout.strip()
+
+            print(f"Deleted FSx filesystem: {fsx_id}")
+
+        except Exception as e:
+            LOGGER.error(f"Failed to delete FSx filesystem: {e}")
+            raise
+
+    def wait_for_filesystem(self, filesystem_id: str):
+        """
+        Wait for FSx filesystem to become available and return its details
+        : param filesystem_id: FSx filesystem ID
+        : return: dictionary containing filesystem details (filesystem_id, dns_name, mount_name)
+        : raises: Exception if filesystem enters FAILED, DELETING, or DELETED state
+        """
+        print(f"Waiting for FSx filesystem {filesystem_id} to be available...")
+        while True:
+            status = run(
+                f"aws fsx describe-file-systems --file-system-id {filesystem_id} "
+                f"--query 'FileSystems[0].Lifecycle' --output text"
+            ).stdout.strip()
+
+            if status == "AVAILABLE":
+                break
+            elif status in ["FAILED", "DELETING", "DELETED"]:
+                raise Exception(f"FSx filesystem entered {status} state")
+
+            print(f"FSx status: {status}, waiting...")
+            time.sleep(30)
+
+        # get fs DNS and mount name
+        fsx_dns = run(
+            f"aws fsx describe-file-systems --file-system-id {filesystem_id} "
+            f"--query 'FileSystems[0].DNSName' --output text"
+        ).stdout.strip()
+
+        fsx_mount = run(
+            f"aws fsx describe-file-systems --file-system-id {filesystem_id} "
+            f"--query 'FileSystems[0].LustreConfiguration.MountName' --output text"
+        ).stdout.strip()
+
+        return {"filesystem_id": filesystem_id, "dns_name": fsx_dns, "mount_name": fsx_mount}
+
+    def create_fsx_security_group(self, ec2_cli, vpc_id, group_name, description):
+        """
+        Create a security group for FSx Lustre and add inbound rules.
+
+        :param ec2_cli: boto3 EC2 client
+        :param vpc_id: The ID of the VPC where the security group will be created
+        :param group_name: Name of the security group to create
+        :param description: Description of the security group
+        :return: The ID of the created security group, or None if creation fails
+        """
+        try:
+            # Create the security group
+            response = ec2_cli.create_security_group(
+                GroupName=group_name,
+                Description=description,
+                VpcId=vpc_id,
+            )
+            sg_id = response["GroupId"]
+            print(f"Created security group: {sg_id}")
+
+            return sg_id
+
+        except ClientError as e:
+            print(f"An error occurred: {e}")
+            return None
+
+    def add_ingress_rules_sg(self, ec2_cli, sg_id, instance_ids):
+        """
+        Add ingress rules to FSx security group for multiple instances
+
+        Args:
+            ec2_cli: boto3 EC2 client
+            sg_id: ID of the FSx security group
+            instance_ids: List of EC2 instance IDs
+        """
+        try:
+            # Get security group IDs for all instances
+            instance_sg_ids = set()
+            for instance_id in instance_ids:
+                response = ec2_cli.describe_instances(InstanceIds=[instance_id])
+                sg_id_instance = response["Reservations"][0]["Instances"][0]["SecurityGroups"][0][
+                    "GroupId"
+                ]
+                instance_sg_ids.add(sg_id_instance)
+
+            instance_group_pairs = [{"GroupId": sg} for sg in instance_sg_ids]
+
+            all_group_pairs = instance_group_pairs + [{"GroupId": sg_id}]
+
+            # Add inbound rules
+            ec2_cli.authorize_security_group_ingress(
+                GroupId=sg_id,
+                IpPermissions=[
+                    {
+                        "IpProtocol": "tcp",
+                        "FromPort": 988,
+                        "ToPort": 1023,
+                        "UserIdGroupPairs": all_group_pairs,
+                    }
+                ],
+            )
+            print(
+                f"Added inbound rules to FSx security group {sg_id} for instance security groups: {instance_sg_ids}"
+            )
+
+        except Exception as e:
+            print(f"Error adding ingress rules: {str(e)}")
+            raise
+
+    def delete_security_group(self, ec2_cli, group_id: str):
+        """
+        Delete the specified security group
+        : param ec2_cli: boto3 EC2 client
+        : param group_id: ID of the security group to delete
+        : raises: Exception if security group deletion fails
+        """
+        try:
+            response = ec2_cli.delete_security_group(
+                GroupId=group_id,
+            )
+            sg_id = response["GroupId"]
+            print(f"Deleted security group: {sg_id}")
+
+        except Exception as e:
+            LOGGER.error(f"Failed to delete security group: {e}")
+            raise
+
+    def setup_csi_driver(self):
+        """
+        Install and configure the AWS FSx CSI Driver in the Kubernetes cluster
+        : return: None
+        : raises: Exception if driver installation or verification fails
+        """
+        try:
+            LOGGER.info("Installing AWS FSx CSI Driver...")
+            run(
+                "helm repo add aws-fsx-csi-driver https://kubernetes-sigs.github.io/aws-fsx-csi-driver/"
+            )
+            run("helm repo update")
+            run(
+                "helm install aws-fsx-csi-driver aws-fsx-csi-driver/aws-fsx-csi-driver --namespace kube-system"
+            )
+            run(
+                "kubectl wait --for=condition=ready pod -l app=fsx-csi-controller -n kube-system --timeout=300s"
+            )
+
+            self._verify_csi_driver()
+            LOGGER.info("FSx CSI Driver installed successfully")
+        except Exception as e:
+            LOGGER.error(f"Failed to setup FSx CSI driver: {e}")
+            raise
+
+    def _verify_csi_driver(self):
+        """
+        Verify that FSx CSI driver pods are running correctly in the cluster
+        : return: None
+        : raises: Exception if driver pods are not found or not running
+        """
+        result = run("kubectl get pods -n kube-system | grep fsx")
+
+        if "fsx-csi-controller" not in result.stdout or "fsx-csi-node" not in result.stdout:
+            raise Exception("FSx CSI driver pods not found")
+
+        fsx_pods = [
+            line
+            for line in result.stdout.split("\n")
+ if ("fsx-csi-controller" in line or "fsx-csi-node" in line) and "Running" in line + ] + + if not fsx_pods: + raise Exception("No running FSx CSI driver pods found") + + LOGGER.info(f"Found {len(fsx_pods)} running FSx CSI driver pods") + + def setup_kubernetes_resources( + self, storage_class_file: str, pv_file: str, pvc_file: str, replacements: Dict[str, str] + ): + """ + Setup Kubernetes FSx resources using provided yaml files and replacements + : param storage_class_file: path to the storage class yaml file + : param pv_file: path to the persistent volume yaml file + : param pvc_file: path to the persistent volume claim yaml file + : param replacements: dictionary of placeholder replacements + Example: {"": "subnet-xxx", "": "sg-xxx"} + : return: None + : raises: Exception if resource creation fails + """ + try: + for file_path in [storage_class_file, pv_file, pvc_file]: + for key, value in replacements.items(): + run(f"sed -i 's|{key}|{value}|g' {file_path}") + + for file_path in [storage_class_file, pv_file, pvc_file]: + run(f"kubectl apply -f {file_path}") + + self.validate_kubernetes_resources() + + except Exception as e: + LOGGER.error(f"Failed to setup Kubernetes FSx resources: {e}") + raise + + def validate_kubernetes_resources(self): + """ + Validate that FSx Kubernetes resources are properly created and bound + : return: True if all resources are validated successfully + : raises: Exception if any resource validation fails + """ + try: + sc_result = run("kubectl get sc fsx-sc") + if "fsx-sc" not in sc_result.stdout or "fsx.csi.aws.com" not in sc_result.stdout: + raise Exception("FSx storage class not created correctly") + + pv_result = run("kubectl get pv fsx-lustre-pv") + if "fsx-lustre-pv" not in pv_result.stdout or "Bound" not in pv_result.stdout: + raise Exception("FSx persistent volume not created correctly") + + pvc_result = run("kubectl get pvc fsx-lustre-pvc") + if "fsx-lustre-pvc" not in pvc_result.stdout or "Bound" not in pvc_result.stdout: + raise Exception("FSx persistent volume claim not created correctly") + + LOGGER.info("FSx Kubernetes resources validated successfully") + return True + + except Exception as e: + LOGGER.error(f"FSx resource validation failed: {e}") + raise diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py new file mode 100644 index 000000000000..5138ce4b20b9 --- /dev/null +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -0,0 +1,645 @@ +import os +import time +import uuid + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError, WaiterError +from fabric import Connection + +# TODO: decide whether we want to copy test_utils to the new path +from test import test_utils +from test.test_utils import ( + AL2023_BASE_DLAMI_ARM64_US_WEST_2, + DEFAULT_REGION, +) +from infra.test_infra.ec2.utils import ( + get_default_vpc_id, + get_subnet_id_by_vpc, + get_ec2_client, + get_availability_zone_ids, + launch_efa_instances_with_retry, + check_instance_state, + check_system_state, + create_name_tags_for_instance, + get_num_efa_interfaces_for_instance_type, + get_network_interface_id, + attach_elastic_ip, + delete_elastic_ips, +) +from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup +from infra.test_infra.test_infra_utils import create_logger + +LOGGER = create_logger(__name__) + +# Constant to represent default region for boto3 commands +DEFAULT_REGION = "us-west-2" +EC2_INSTANCE_ROLE_NAME = "ec2TestInstanceRole" +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", 
"false").lower() == "true" + +# V2 test path constants +V2_LOCAL_TEST_PATH = "test/v2" +V2_INSTANCE_PATH = "$HOME/test_v2" +V2_CONTAINER_PATH = "/test_v2" + +TEST_ID = str(uuid.uuid4()) + + +def ec2_instance_ami(region, image): + if "arm64" in image: + return AL2023_BASE_DLAMI_ARM64_US_WEST_2 + + return test_utils.get_dlami_id(region) + + +def ec2_instance_type(image): + if "arm64" in image: + return "g5g.16xlarge" + else: + return "p4d.24xlarge" + + +def availability_zone_options(ec2_client, ec2_instance_type, region): + """ + Parametrize with a reduced list of availability zones for particular instance types for which + capacity has been reserved in that AZ. For other instance types, parametrize with list of all + AZs in the region. + :param ec2_client: boto3 Client for EC2 + :param ec2_instance_type: str instance type for which AZs must be determined + :param region: str region in which instance must be created + :return: list of str AZ names + """ + allowed_availability_zones = None + if ec2_instance_type in ["p4de.24xlarge"]: + if region == "us-east-1": + allowed_availability_zones = ["us-east-1d", "us-east-1c"] + if ec2_instance_type in ["p4d.24xlarge"]: + if region == "us-west-2": + allowed_availability_zones = ["us-west-2b", "us-west-2c"] + if not allowed_availability_zones: + allowed_availability_zones = get_availability_zone_ids(ec2_client) + return allowed_availability_zones + + +def check_ip_rule_exists(security_group_rules, ip_address): + """ + Check if an IP rule exists in security group rules + """ + if not security_group_rules: + return False + + for rule in security_group_rules: + if ( + rule.get("FromPort") == 80 + and rule.get("ToPort") == 80 + and rule.get("IpProtocol") == "tcp" + and "IpRanges" in rule + ): + for ip_range in rule.get("IpRanges", []): + if ip_range.get("CidrIp") == f"{ip_address}/32": + LOGGER.info(f"Found existing rule for IP {ip_address}") + return True + return False + + +def authorize_ingress(ec2_client, group_id, ip_address): + try: + response = ec2_client.describe_security_groups(GroupIds=[group_id]) + if response.get("SecurityGroups") and response["SecurityGroups"]: + existing_rules = response["SecurityGroups"][0].get("IpPermissions", []) + if check_ip_rule_exists(existing_rules, ip_address): + LOGGER.info("Ingress rule already exists, skipping creation.") + return + + ec2_client.authorize_security_group_ingress( + GroupId=group_id, + IpPermissions=[ + { + "IpProtocol": "tcp", + "FromPort": 8000, + "ToPort": 8000, + "IpRanges": [ + { + "CidrIp": f"{ip_address}/32", + "Description": "Temporary access for vLLM testing", + } + ], + } + ], + ) + LOGGER.info("Ingress rule added successfully.") + except ClientError as e: + LOGGER.error(f"Failed to authorize ingress: {str(e)}") + raise + + +def setup_test_artifacts(ec2_client, instances, key_filename, region): + """ + Setup test artifacts on EC2 instances + """ + ec2_connections = {} + master_connection = None + worker_connection = None + + for instance in instances: + instance_id = instance["InstanceId"] + try: + instance_details = ec2_client.describe_instances(InstanceIds=[instance_id])[ + "Reservations" + ][0]["Instances"][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + # Test connection + connection.run('echo "Connection test"', hide=True) + ec2_connections[instance_id] = 
connection + + if not master_connection: + master_connection = connection + else: + worker_connection = connection + + print(f"Successfully connected to instance {instance_id}") + + except Exception as e: + print(f"Failed to connect to instance {instance_id}: {str(e)}") + raise + + artifact_folder = f"vllm-{TEST_ID}-folder" + s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder) + + def delete_s3_artifact_copy(): + test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location) + + try: + # Setup master instance + if master_connection: + master_connection.run(f"rm -rf {V2_INSTANCE_PATH}") + master_connection.run( + f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" + ) + print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") + master_connection.run( + f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" + ) + + if worker_connection: + worker_connection.run(f"rm -rf {V2_INSTANCE_PATH}") + worker_connection.run( + f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" + ) + print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") + worker_connection.run( + f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" + ) + + finally: + delete_s3_artifact_copy() + + if worker_connection: + return [master_connection, worker_connection] + return [master_connection] + + +def launch_regular_instances_with_retry( + ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, +): + """ + Launch regular (non-EFA) EC2 instances with retry capability + """ + instances = None + error = None + + for a_zone in availability_zone_options: + ec2_run_instances_definition["Placement"] = {"AvailabilityZone": a_zone} + try: + instances = ec2_client.run_instances(**ec2_run_instances_definition)["Instances"] + if instances: + break + except ClientError as e: + LOGGER.error(f"Failed to launch in {a_zone} due to {e}") + error = e + continue + + if not instances: + raise error or Exception("Failed to launch instances in any availability zone") + + return instances + + +def efa_ec2_instances( + ec2_client, + ec2_instance_type, + ec2_instance_role_name, + ec2_key_name, + ec2_instance_ami, + region, + availability_zone_options, + is_arm64, +): + instances = None + key_filename = None + elastic_ip_allocation_ids = [] + is_efa = not is_arm64 + + try: + ec2_key_name = f"{ec2_key_name}-{TEST_ID}" + print(f"Creating instance: CI-CD {ec2_key_name}") + key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name) + volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda" + + instance_name_prefix = f"CI-CD {ec2_key_name}" + ec2_run_instances_definition = { + "BlockDeviceMappings": [ + { + "DeviceName": volume_name, + "Ebs": { + "DeleteOnTermination": True, + "VolumeSize": 600, + "VolumeType": "gp3", + "Iops": 3000, + "Throughput": 125, + }, + }, + ], + "ImageId": ec2_instance_ami, + "InstanceType": ec2_instance_type, + "IamInstanceProfile": {"Name": ec2_instance_role_name}, + "KeyName": ec2_key_name, + "MaxCount": 2 if is_efa else 1, + "MinCount": 2 if is_efa else 1, + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [{"Key": "Name", "Value": instance_name_prefix}], + } + ], + } + + if is_efa: + instances = launch_efa_instances_with_retry( + 
ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, + ) + else: + instances = launch_regular_instances_with_retry( + ec2_client, + ec2_instance_type, + availability_zone_options, + ec2_run_instances_definition, + ) + + master_instance_id = instances[0]["InstanceId"] + check_instance_state(master_instance_id, state="running", region=region) + check_system_state( + master_instance_id, system_status="ok", instance_status="ok", region=region + ) + print(f"Master instance {master_instance_id} is ready") + create_name_tags_for_instance(master_instance_id, f"{instance_name_prefix}_master", region) + if is_efa: + for i in range(1, len(instances)): + worker_instance_id = instances[i]["InstanceId"] + create_name_tags_for_instance( + worker_instance_id, f"{instance_name_prefix}_worker_{i}", region + ) + check_instance_state(worker_instance_id, state="running", region=region) + check_system_state( + worker_instance_id, system_status="ok", instance_status="ok", region=region + ) + print(f"Worker instance {worker_instance_id} is ready") + + num_efa_interfaces = get_num_efa_interfaces_for_instance_type( + ec2_instance_type, region=region + ) + + print(num_efa_interfaces) + + if num_efa_interfaces > 1: + for instance in instances: + try: + instance_id = instance["InstanceId"] + + network_interface_id = get_network_interface_id(instance_id, region) + elastic_ip_allocation_id = attach_elastic_ip( + network_interface_id, region, ENABLE_IPV6_TESTING + ) + elastic_ip_allocation_ids.append(elastic_ip_allocation_id) + except Exception as e: + if elastic_ip_allocation_ids: + delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) + raise Exception(f"Error allocating elastic IP: {str(e)}") + + connections = setup_test_artifacts(ec2_client, instances, key_filename, region) + return_val = { + "instances": [ + (instance_info["InstanceId"], key_filename) for instance_info in instances + ], + "elastic_ips": elastic_ip_allocation_ids, + "connections": connections, + } + print("Launched EFA Test instances") + return return_val + + except Exception as e: + print(f"Error in efa_ec2_instances: {str(e)}") + # Clean up elastic IPs + if elastic_ip_allocation_ids: + try: + delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) + except Exception as cleanup_error: + print(f"Error cleaning up elastic IPs: {str(cleanup_error)}") + + # Clean up instances + if instances: + try: + instance_ids = [instance["InstanceId"] for instance in instances] + ec2_client.terminate_instances(InstanceIds=instance_ids) + # Wait for instances to terminate + waiter = ec2_client.get_waiter("instance_terminated") + waiter.wait(InstanceIds=instance_ids) + except Exception as cleanup_error: + print(f"Error terminating instances: {str(cleanup_error)}") + + # Clean up key pair + if key_filename: + try: + if os.path.exists(key_filename): + os.remove(key_filename) + if os.path.exists(f"{key_filename}.pub"): + os.remove(f"{key_filename}.pub") + except Exception as cleanup_error: + print(f"Error cleaning up key files: {str(cleanup_error)}") + + raise + + +def _setup_instance(connection, fsx_dns_name, mount_name): + """ + Setup FSx mount and VLLM environment on an instance synchronously + """ + os.chdir("..") + # Copy script to instance + connection.put("vllm/ec2/utils/setup_fsx_vllm.sh", "/home/ec2-user/setup_fsx_vllm.sh") + + # Make script executable and run it + commands = [ + "chmod +x /home/ec2-user/setup_fsx_vllm.sh", + f"/home/ec2-user/setup_fsx_vllm.sh {fsx_dns_name} {mount_name}", + ] + + # Execute commands 
synchronously + result = connection.run("; ".join(commands)) + return result + + +def cleanup_resources(ec2_cli, resources, fsx): + """Cleanup all resources in reverse order of creation""" + cleanup_errors = [] + + def wait_for_instances(instance_ids): + waiter = ec2_cli.get_waiter("instance_terminated") + try: + waiter.wait(InstanceIds=instance_ids, WaiterConfig={"Delay": 60, "MaxAttempts": 100}) + return True + except WaiterError as e: + print(f"Warning: Instance termination waiter timed out: {str(e)}") + return False + + if resources.get("elastic_ips"): + try: + delete_elastic_ips(resources["elastic_ips"], ec2_cli) + print(f"Deleted elastic IPs: {resources['elastic_ips']}") + except Exception as e: + cleanup_errors.append(f"Failed to cleanup Elastic IPs: {str(e)}") + + if resources.get("instances_info"): + try: + instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] + ec2_cli.terminate_instances(InstanceIds=instance_ids) + print(f"Terminating instances: {instance_ids}") + + if not wait_for_instances(instance_ids): + cleanup_errors.append("Instances did not terminate within expected timeframe") + + for _, key_filename in resources["instances_info"]: + if key_filename: + try: + ec2_cli.delete_key_pair(KeyName=key_filename) + for ext in ["", ".pub"]: + file_path = f"{key_filename}{ext}" + if os.path.exists(file_path): + os.remove(file_path) + except Exception as e: + cleanup_errors.append(f"Failed to delete key file: {str(e)}") + except Exception as e: + cleanup_errors.append(f"Failed to cleanup EC2 resources: {str(e)}") + + if resources.get("fsx_config"): + try: + fsx.delete_fsx_filesystem(resources["fsx_config"]["filesystem_id"]) + print(f"Deleted FSx filesystem: {resources['fsx_config']['filesystem_id']}") + except Exception as e: + cleanup_errors.append(f"Failed to delete FSx filesystem: {str(e)}") + + time.sleep(30) + + if resources.get("sg_fsx"): + max_attempts = 10 + for attempt in range(max_attempts): + try: + ec2_cli.delete_security_group(GroupId=resources["sg_fsx"]) + print(f"Deleted security group: {resources['sg_fsx']}") + break + except Exception as e: + if attempt == max_attempts - 1: + cleanup_errors.append( + f"Failed to delete security group after {max_attempts} attempts: {str(e)}" + ) + else: + print(f"Retry {attempt + 1}/{max_attempts} to delete security group") + time.sleep(30) + + if cleanup_errors: + raise Exception("Cleanup errors occurred:\n" + "\n".join(cleanup_errors)) + + +def launch_ec2_instances(ec2_cli, image): + """Launch EC2 instances with EFA support""" + instance_type = ec2_instance_type(image) + ami_id = ec2_instance_ami(DEFAULT_REGION, image) + az_options = availability_zone_options(ec2_cli, instance_type, DEFAULT_REGION) + is_arm64 = True if "arm64" in image else False + + instances_info = efa_ec2_instances( + ec2_client=ec2_cli, + ec2_instance_type=instance_type, + ec2_instance_role_name=EC2_INSTANCE_ROLE_NAME, + ec2_key_name="vllm-ec2-test", + ec2_instance_ami=ami_id, + region=DEFAULT_REGION, + availability_zone_options=az_options, + is_arm64=is_arm64, + ) + print(f"Launched instances: {instances_info}") + return instances_info + + +def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info): + """ + Configure security groups for FSx and EC2 instances + + Args: + ec2_cli: boto3 EC2 client + fsx: FsxSetup instance + vpc_id: VPC ID where security group will be created + instances_info: List of tuples containing (instance_id, key_filename) + + Returns: + str: FSx security group ID + """ + try: + fsx_name = 
f"fsx-lustre-vllm-ec2-test-sg-{instance_id}-{TEST_ID}" + # Create FSx security group + sg_fsx = fsx.create_fsx_security_group( + ec2_cli, + vpc_id, + fsx_name, + "Security group for FSx Lustre VLLM EC2 Tests", + ) + print(f"Created FSx security group: {sg_fsx}") + + # Get instance IDs from instances_info + instance_ids = [instance_id for instance_id, _ in instances_info] + + # Add security group rules + fsx.add_ingress_rules_sg(ec2_cli, sg_fsx, instance_ids) + + return sg_fsx + + except Exception as e: + print(f"Error configuring security groups: {str(e)}") + raise + + +def setup_instance(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name): + """Setup FSx mount on a single instance""" + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])["Reservations"][0][ + "Instances" + ][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + return _setup_instance(connection, fsx_dns_name, mount_name) + + +def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name): + """Mount FSx on worker instance without running setup script""" + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])["Reservations"][0][ + "Instances" + ][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + commands = [ + "sudo yum install -y lustre-client", + "sudo mkdir -p /fsx", + f"sudo mount -t lustre -o relatime,flock {fsx_dns_name}@tcp:/{mount_name} /fsx", + ] + + for cmd in commands: + connection.run(cmd) + + +def setup(image): + """Main setup function for VLLM on EC2 with FSx""" + print("Testing vllm on ec2........") + fsx = FsxSetup(DEFAULT_REGION) + ec2_cli = get_ec2_client(DEFAULT_REGION) + resources = {"instances_info": None, "fsx_config": None, "sg_fsx": None} + + try: + vpc_id = get_default_vpc_id(ec2_cli) + subnet_ids = get_subnet_id_by_vpc(ec2_cli, vpc_id) + + instance_result = launch_ec2_instances(ec2_cli, image) + resources["instances_info"] = instance_result["instances"] + resources["elastic_ips"] = instance_result["elastic_ips"] + resources["connections"] = instance_result["connections"] + print("Waiting 60 seconds for instances to initialize...") + time.sleep(60) + + instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] + resources["sg_fsx"] = configure_security_groups( + instance_ids[0], ec2_cli, fsx, vpc_id, resources["instances_info"] + ) + + # Create FSx filesystem + resources["fsx_config"] = fsx.create_fsx_filesystem( + subnet_ids[0], + [resources["sg_fsx"]], + 1200, + "SCRATCH_2", + {"Name": f"fsx-lustre-vllm-ec2-test-{instance_ids[0]}-{TEST_ID}"}, + ) + print("Created FSx filesystem") + + master_instance_id, master_key_filename = resources["instances_info"][0] + setup_instance( + master_instance_id, + master_key_filename, + ec2_cli, + resources["fsx_config"]["dns_name"], + resources["fsx_config"]["mount_name"], + ) + print(f"Setup completed for master instance {master_instance_id}") + + if len(resources["instances_info"]) > 1: + worker_instance_id, worker_key_filename = resources["instances_info"][1] + mount_fsx_on_worker( + worker_instance_id, + worker_key_filename, + ec2_cli, + 
resources["fsx_config"]["dns_name"], + resources["fsx_config"]["mount_name"], + ) + print(f"FSx mounted on worker instance {worker_instance_id}") + + return resources + + except Exception as e: + print(f"Error during setup: {str(e)}") + cleanup_resources(ec2_cli, resources, fsx) + raise + + +if __name__ == "__main__": + setup() diff --git a/infra/test_infra/ec2/vllm/setup_fsx_vllm.sh b/infra/test_infra/ec2/vllm/setup_fsx_vllm.sh new file mode 100644 index 000000000000..685b1706c432 --- /dev/null +++ b/infra/test_infra/ec2/vllm/setup_fsx_vllm.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# setup_fsx_vllm.sh +# Script to mount FSx and setup VLLM environment + + +# Get FSx DNS name from argument +FSX_DNS_NAME=$1 +MOUNT_NAME=$2 + +# Function to log messages with hostname +log() { + local HOSTNAME=$(hostname) + echo "[Host ${HOSTNAME}] $1" +} + + +# Function to check if command was successful +check_error() { + if [ $? -ne 0 ]; then + echo "Error: $1" + exit 1 + fi +} + +if [ -z "$FSX_DNS_NAME" ] || [ -z "$MOUNT_NAME" ]; then + echo "Usage: $0 " + exit 1 +fi + +# Install required packages +log "Installing required packages..." +sudo yum install -y nfs-utils git +check_error "Failed to install base packages" + + +# Install the latest Lustre client +log "Installing latest Lustre client..." +sudo yum install -y lustre-client +check_error "Failed to install Lustre client" + + +# Create FSx mount directory +log "Creating FSx mount directory..." +sudo mkdir -p /fsx +check_error "Failed to create /fsx directory" + + +# Modify mount command to include verbose output +sudo mount -t lustre -o relatime,flock ${FSX_DNS_NAME}@tcp:/${MOUNT_NAME} /fsx + +# Create VLLM directory in FSx +log "Creating VLLM directory..." +sudo mkdir -p /fsx/vllm-dlc + +check_error "Failed to create /fsx/vllm-dlc directory" + +# Set proper permissions +log "Setting proper permissions..." +sudo chown -R ec2-user:ec2-user /fsx/vllm-dlc +check_error "Failed to set permissions" + +cd /fsx/vllm-dlc +git clone https://github.com/vllm-project/vllm.git +cd vllm +git checkout tags/v0.10.2 + +# Download ShareGPT dataset +log "Downloading ShareGPT dataset..." +cd /fsx/vllm-dlc && wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +check_error "Failed to download ShareGPT dataset" + +log "Setup completed successfully!" + + \ No newline at end of file From a0c90df5bfd534c062c359291fee25f67496b45d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:32:22 -0700 Subject: [PATCH 04/33] add v2 path for EFA tests --- test/v2/ec2/efa/__init__.py | 1 + test/v2/ec2/efa/testEFA | 103 ++++++++ test/v2/ec2/efa/testEFASanity | 27 +++ test/v2/ec2/efa/test_efa.py | 437 ++++++++++++++++++++++++++++++++++ 4 files changed, 568 insertions(+) create mode 100644 test/v2/ec2/efa/__init__.py create mode 100644 test/v2/ec2/efa/testEFA create mode 100644 test/v2/ec2/efa/testEFASanity create mode 100644 test/v2/ec2/efa/test_efa.py diff --git a/test/v2/ec2/efa/__init__.py b/test/v2/ec2/efa/__init__.py new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/test/v2/ec2/efa/__init__.py @@ -0,0 +1 @@ + diff --git a/test/v2/ec2/efa/testEFA b/test/v2/ec2/efa/testEFA new file mode 100644 index 000000000000..4b676249d816 --- /dev/null +++ b/test/v2/ec2/efa/testEFA @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ +set -ex + +NUM_HOSTS_FILE=$1 +NUM_HOSTS=$2 +IS_IPV6=$3 + +if [[ -z "${CUDA_HOME}" ]]; then + echo "CUDA_HOME variable is empty, please define it in dockerfile" + exit 1 +fi + +TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") +INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type) + +GPU_COUNT=$(nvidia-smi -L | wc -l) +NODES=$(($GPU_COUNT * $NUM_HOSTS)) + + +PRETTY_NAME=$(cat /etc/os-release | grep PRETTY_NAME) +TRAINING_LOG="/test/logs/testEFA.log" + +USE_DEVICE_RDMA_ARG="" + +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html +# g5.24xlarge we use in RC is not RDMA read supported +if [[ ${INSTANCE_TYPE} == p4d.24xlarge || ${INSTANCE_TYPE} == p4de.24xlarge || ${INSTANCE_TYPE} == p5.48xlarge ]]; then + USE_DEVICE_RDMA_ARG="-x FI_EFA_USE_DEVICE_RDMA=1" +fi + +validate_all_reduce_performance_logs(){ + grep "aws-ofi-nccl" ${TRAINING_LOG} || { echo "aws-ofi-nccl is not working, please check if it is installed correctly"; exit 1; } + grep -i "NET/OFI Selected [Pp]rovider is efa" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; } + # EFA 1.37.0 using "Using network Libfabric" instead of "Using network AWS Libfabric" + grep -E "Using network (AWS )?Libfabric" ${TRAINING_LOG} || { echo "efa is not working, please check if it is installed correctly"; exit 1; } + if [[ ${INSTANCE_TYPE} == p4d* || ${INSTANCE_TYPE} == p5* ]]; then + grep "NCCL_TOPO_FILE set by environment to" ${TRAINING_LOG} + # EFA 1.37.0 change from NET/AWS Libfabric/0/GDRDMA to NET/Libfabric/0/GDRDMA + grep -E "NET/(AWS )?Libfabric/0/GDRDMA" ${TRAINING_LOG} + fi +} + +check_efa_nccl_all_reduce_performance(){ + benchmark=$(cat $TRAINING_LOG | grep '1073741824' | tail -n1 | awk -F " " '{{print $11}}' | sed 's/ //' | sed 's/ 5e-07//') + echo "Benchmark throughput: ${benchmark}" + if [[ -z "${benchmark}" ]]; then + echo "benchmark variable is empty" + exit 1 + fi + + # The standard throughput should be at least 41 for 2 p4d with 4 EFA devices + # However, if the 2 instances are not in the same A-Z in the same region, performance can decrease. + # To account for this we need to modify thresholds dynamically based on where instances are. + # Temporarily setting these to be < 50% of optimal until AWS OFI NCCL team has concrete numbers for this. 
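+    # Bash arithmetic only handles integers, so the comparison below pipes the
+    # measured value and the threshold through awk, which prints 1 when
+    # benchmark >= threshold.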
+ PERFORMANCE_THRESHOLD="3" + + if [[ $(echo "$benchmark $PERFORMANCE_THRESHOLD" | awk '{print ($1 >= $2)}') == 1 ]]; then + echo "***************************** check_efa_nccl_all_reduce_performance passed *****************************" + else + echo "***************************** check_efa_nccl_all_reduce_performance failed *****************************" + exit 1 + fi +} + +check_efa_nccl_all_reduce(){ + echo "Running all_reduce_perf test" + + if [[ ${IS_IPV6} == "True" ]]; then + echo "Running all_reduce_perf test with IPv6: using IPv6 mode with NCCL_SOCKET_FAMILY=AF_INET6" + mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ + -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ + -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_FAMILY=AF_INET6 \ + /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" + else + echo "Running all_reduce_perf test with IPv4: using default IPv4 mode" + # Need to pass -x PATH because rank non-zero nodes seem to "forget" the value of PATH that is pre-configured into + # the container. Not using full-paths of mpirun and other executables because these paths can change across PyTorch + # versions in DLC images. + mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ + -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ + -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" + fi + + RETURN_VAL=${PIPESTATUS[0]} + # In case, if you would like see logs, uncomment below line + # RESULT=$(cat ${TRAINING_LOG}) + + if [ ${RETURN_VAL} -eq 0 ]; then + echo "***************************** check_efa_nccl_all_reduce passed *****************************" + else + echo "***************************** check_efa_nccl_all_reduce failed *****************************" + fi + validate_all_reduce_performance_logs + check_efa_nccl_all_reduce_performance +} + +check_efa_nccl_all_reduce diff --git a/test/v2/ec2/efa/testEFASanity b/test/v2/ec2/efa/testEFASanity new file mode 100644 index 000000000000..1f350628c668 --- /dev/null +++ b/test/v2/ec2/efa/testEFASanity @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
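+#
+# These checks are expected to run inside the DLC container: the EFA installer
+# places its tooling under /opt/amazon/efa (prepended to PATH below) and records
+# the installed components in /opt/amazon/efa_installed_packages.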
+ +set -ex + +export PATH=/opt/amazon/efa/bin:$PATH + +# check if efa components are correctly installed +cat /opt/amazon/efa_installed_packages + +# check Libfabric EFA interfaces +fi_info -p efa +fi_info -p efa -t FI_EP_RDM | grep 'FI_EP_RDM' + +apt-get update && apt-get install -y kmod + +# check if ib_uverbs is present +lsmod | grep ib_uverbs + +# ensure that the security group created is configured correctly +/test/bin/efa/efa_test.sh + +# Queries local RDMA devices +ibv_devinfo + +# check if gdr device is loaded +cat /sys/class/infiniband/**/device/p2p | grep 'NVIDIA' diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py new file mode 100644 index 000000000000..dded29bdffcc --- /dev/null +++ b/test/v2/ec2/efa/test_efa.py @@ -0,0 +1,437 @@ +import os + +import pytest + +import test.test_utils.ec2 as ec2_utils +from test.test_utils import ( + CONTAINER_TESTS_PREFIX_V2, + get_account_id_from_image_uri, + get_region_from_image_uri, + is_pr_context, + is_efa_dedicated, + are_heavy_instance_ec2_tests_enabled, + login_to_ecr_registry, + run_cmd_on_container, +) +from packaging.version import Version +from packaging.specifiers import SpecifierSet + +from infra.test_infra.ec2.utils import ( + get_efa_ec2_instance_type, + filter_efa_instance_type, + filter_efa_only_p4_instance_type, +) + +BUILD_ALL_REDUCE_PERF_CMD = os.path.join( + CONTAINER_TESTS_PREFIX_V2, "efa", "build_all_reduce_perf.sh" +) +EFA_SANITY_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX_V2, "efa", "testEFASanity") +EFA_INTEGRATION_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX_V2, "efa", "testEFA") +EFA_PYTORCH_HEALTHCHECK_TEST_CMD = os.path.join( + CONTAINER_TESTS_PREFIX_V2, "healthcheck_tests", "efa_checker_single_node.sh" +) + +ENABLE_IPV6_TESTING = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + +MASTER_SSH_KEY_NAME = "master_id_rsa" +WORKER_SSH_KEY_NAME = "worker_id_rsa" +MASTER_CONTAINER_NAME = "master_container" +WORKER_CONTAINER_NAME = "worker_container" +HOSTS_FILE_LOCATION = "/root/hosts" + +DEFAULT_EFA_TIMEOUT = 300 + + +def get_vllm_container_name(test_scenario, arch_type, node_role=None): + """ + Generate unique container name for vLLM v2 EC2 tests + + Args: + test_scenario: Test scenario (e.g., "efa", "single-node") + arch_type: Architecture from buildspec (e.g., "x86_64", "arm64") + node_role: For multi-node: "master", "worker-0", etc. (optional) + + Returns: + Container name like "vllm-ec2-efa-x86_64-master" or "vllm-ec2-single-node-arm64" + """ + base_name = f"vllm-ec2-{test_scenario}-{arch_type}" + return f"{base_name}-{node_role}" if node_role else base_name + + +EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type( + default="p4d.24xlarge", + filter_function=filter_efa_instance_type, +) + +EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type( + default="p4d.24xlarge", + filter_function=filter_efa_only_p4_instance_type, +) + + +# TODO: decide on whether to keep this commented out or left out until actual implementation of each framework +# def test_pytorch_efa( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only +# ): +# """ +# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA +# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version +# installed in the DLC image. The test also builds nccl-tests to create the all_reduce_perf +# binary necessary for multinode tests, on each node. 
+# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances +# on pipelines. +# :param pytorch_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# number_of_nodes = 2 +# _setup_multinode_efa_instances( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) + +# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" + +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"{EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) + + +# def test_efa_tensorflow( +# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only +# ): +# """ +# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA +# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version +# installed in the DLC image. The test also builds nccl-tests to create the all_reduce_perf +# binary necessary for multinode tests, on each node. +# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances +# on pipelines. +# :param tensorflow_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# number_of_nodes = 2 +# _setup_multinode_efa_instances( +# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) + +# # pass IPv6 flag if enabled +# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" + +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"export CUDA_HOME='/usr/local/cuda'; {EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) + + +# def test_pytorch_efa_healthcheck( +# pytorch_training, +# efa_ec2_instances, +# efa_ec2_connections, +# ec2_instance_type, +# region, +# gpu_only, +# ): +# """ +# Run EFA Health Check tests on DLC. 
+# :param pytorch_training: str PyTorch Training DLC image URI +# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances +# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances +# :param ec2_instance_type: str Instance Type being tested +# :param region: str Region in which EFA-enabled instances are launched +# :param gpu_only: pytest fixture to limit test only to GPU DLCs +# """ +# _setup_multinode_efa_instances( +# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region +# ) +# master_connection = efa_ec2_connections[0] +# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) +# run_cmd_on_container( +# MASTER_CONTAINER_NAME, +# master_connection, +# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", +# hide=False, +# timeout=DEFAULT_EFA_TIMEOUT, +# ) + + +def _setup_multinode_efa_instances( + image, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, arch_type=None +): + """ + Pull and start test image containers on both master and worker instances, configure + password-less SSH between master and worker nodes, and build all_reduce_perf binary on + master and worker nodes. + :param image: str DLC image URI to be tested + :param efa_ec2_instances: list of tuples of instance_id, keypair_filepath for each instance + :param efa_ec2_connections: list of fabric connection objects + :param ec2_instance_type: str instance type being used + :param region: str region name in which test is being run + :param arch_type: str architecture type (e.g., "x86_64", "arm64") + """ + # Asynchronously pull docker image on all instances + _pull_image_on_all_instances(efa_ec2_connections, image) + # Configure master node container + master_connection = efa_ec2_connections[0] + + # Determine container names - use unique names for vLLM, standard names for others + if "vllm" in image: + # Use provided arch_type or infer from image as fallback + if arch_type is None: + arch_type = "arm64" if "arm64" in image else "x86_64" + master_container_name = get_vllm_container_name("efa", arch_type, "master") + else: + master_container_name = MASTER_CONTAINER_NAME + + build_all_reduce_perf_promises = [] + # Run container + _setup_container(master_connection, image, master_container_name) + # Build all_reduce_perf binary using nccl-tests + promise = run_cmd_on_container( + master_container_name, + master_connection, + BUILD_ALL_REDUCE_PERF_CMD, + timeout=DEFAULT_EFA_TIMEOUT, + asynchronous=True, + ) + build_all_reduce_perf_promises.append(promise) + + for idx, worker_connection in enumerate(efa_ec2_connections[1:]): + # Determine worker container name + if "vllm" in image: + worker_container_name = get_vllm_container_name("efa", arch_type, f"worker-{idx}") + else: + worker_container_name = WORKER_CONTAINER_NAME + + # Run container + _setup_container(worker_connection, image, worker_container_name) + # Build all_reduce_perf binary using nccl-tests + promise = run_cmd_on_container( + worker_container_name, + worker_connection, + BUILD_ALL_REDUCE_PERF_CMD, + timeout=DEFAULT_EFA_TIMEOUT, + asynchronous=True, + ) + build_all_reduce_perf_promises.append(promise) + + # Configure master node SSH client-side configurations + _setup_master_efa_ssh_config(master_connection) + # Create a hosts file that provides mpi with IP addresses and no. 
of GPUs in each node + worker_instance_ids = [instance_id for instance_id, _ in efa_ec2_instances[1:]] + _create_master_mpi_hosts_file( + efa_ec2_connections, worker_instance_ids, ec2_instance_type, region + ) + # Obtain master node SSH public key for future use + master_pub_key = run_cmd_on_container( + MASTER_CONTAINER_NAME, master_connection, f"cat $HOME/.ssh/{MASTER_SSH_KEY_NAME}.pub" + ).stdout.strip("\n") + + # Configure worker node containers + for worker_connection in efa_ec2_connections[1:]: + # Configure worker node SSH server-side configurations, launch SSH daemon, and allow + # password-less SSH access from master to worker nodes. + _setup_worker_efa_ssh_config(worker_connection, master_pub_key) + + # Wait for all_reduce_perf binaries to be built in all containers + for promise in build_all_reduce_perf_promises: + promise.join() + + +def _pull_image_on_all_instances(connections, docker_image): + """ + Asynchronously pull tested image on all instances + :param connections: list of Fabric Connection objects + :param docker_image: str DLC image URI to be tested + """ + account_id = get_account_id_from_image_uri(docker_image) + region = get_region_from_image_uri(docker_image) + + for conn in connections: + login_to_ecr_registry(conn, account_id, region) + + promises = [conn.run(f"docker pull {docker_image}", asynchronous=True) for conn in connections] + for prom in promises: + prom.join() + + +def _setup_container(connection, docker_image, container_name): + """ + Pull and run tested image with all EFA devices made available to container + :param connection: Fabric Connection object + :param docker_image: str DLC image URI to be tested + :param container_name: str container name + """ + devices = ec2_utils.get_efa_devices_on_instance(connection) + docker_devices_args = [f"--device {device_location}" for device_location in devices] + docker_all_devices_arg = " ".join(docker_devices_args) + + # Remove pre-existing containers if reusing an instance + connection.run(f"docker rm -f {container_name}", hide=True) + + # Use network mode host, rather than the default "bridge" to allow direct access to container + # using SSH on a pre-defined port (as decided by sshd_config on server-side). + # Allow instance to share all memory with container using memlock=-1:-1. + # Share all EFA devices with container using --device for all EFA devices. + if "vllm" in docker_image: + connection.run( + f"docker run --entrypoint=/bin/bash -e CUDA_HOME=/usr/local/cuda --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " + f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image}" + ) + else: + connection.run( + f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " + f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image} bash" + ) + + +def _setup_master_efa_ssh_config(connection): + """ + Set up SSH client config on master container to connect to worker + :param connection: Fabric Connection object + """ + run_cmd_on_container( + MASTER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{MASTER_SSH_KEY_NAME}*" + ) + # When running container in --network=host, the container hostname changes, requiring + # a new SSH key to be generated. 
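+    # The key is created with an empty passphrase (-N "") so the SSH client
+    # config written below can authenticate non-interactively.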
+ run_cmd_on_container( + MASTER_CONTAINER_NAME, + connection, + f"""ssh-keygen -t rsa -f $HOME/.ssh/{MASTER_SSH_KEY_NAME} -N "" """, + ) + # Configure SSH client-side to always use newly created key, and use port 2022, since this is + # the port configured in the worker node SSH daemon. + master_container_ssh_config = ( + "Host *\n" + f" IdentityFile /root/.ssh/{MASTER_SSH_KEY_NAME}\n" + " StrictHostKeyChecking no\n" + " UserKnownHostsFile /dev/null\n" + " Port 2022" + ) + run_cmd_on_container( + MASTER_CONTAINER_NAME, + connection, + f"""echo -e "{master_container_ssh_config}" > $HOME/.ssh/config""", + ) + run_cmd_on_container(MASTER_CONTAINER_NAME, connection, "chmod -R 600 $HOME/.ssh/*") + + +def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region): + """ + Create MPI Hosts file that contains private IP addresses of all hosts used in training job. + :param efa_ec2_connections: List of Fabric Connection objects [master_connection, *worker_connections] + :param worker_instance_ids: list of str worker instance IDs + :param instance_type: str EC2 Instance Type being used + :param region: str region name in which test is run + """ + master_connection = efa_ec2_connections[0] + slots = ec2_utils.get_instance_num_gpus(instance_type=instance_type) + worker_instance_private_ips = [ + ec2_utils.get_private_ip(instance_id, region) for instance_id in worker_instance_ids + ] + + if ENABLE_IPV6_TESTING: + master_ip = master_connection.ipv6_address + if not master_ip: + raise RuntimeError("IPv6 testing enabled but no IPv6 address found for master node") + + worker_ips = [conn.ipv6_address for conn in efa_ec2_connections[1:]] + if not all(worker_ips): + raise RuntimeError("IPv6 testing enabled but not all workers have IPv6 addresses") + + hosts_string = f"compute1 slots={slots} " + etc_string = f"{master_ip} compute1" + compute_counter = 2 + + for worker_ip in worker_ips: + compute_name = f"compute{compute_counter}" + hosts_string += f"\n{compute_name} slots={slots} " + etc_string += f"\n{worker_ip} {compute_name}" + compute_counter += 1 + + run_cmd_on_container( + MASTER_CONTAINER_NAME, master_connection, f"""echo "{etc_string}" > /etc/hosts""" + ) + + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", + ) + else: + # Configure MPI hosts file with IP addresses and slots for worker nodes + hosts_string = f"localhost slots={slots} " + for worker_ip in worker_instance_private_ips: + hosts_string += f"\n{worker_ip} slots={slots} " + + run_cmd_on_container( + MASTER_CONTAINER_NAME, + master_connection, + f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", + ) + + +def _setup_worker_efa_ssh_config(connection, master_pub_key): + """ + Set up SSH server config on worker container to allow connections from master. + :param connection: Fabric Connection object + :param master_pub_key: str Master node public SSH key to allow password-less SSH access + """ + # Force SSH Daemon to use port 2022, since port 22 is already in use by the host instance + run_cmd_on_container( + WORKER_CONTAINER_NAME, connection, """echo "Port 2022" >> /etc/ssh/sshd_config""" + ) + run_cmd_on_container( + WORKER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{WORKER_SSH_KEY_NAME}*" + ) + # When running container in --network=host, the container hostname changes, requiring + # a new SSH key to be generated. 
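+    # As on the master, the key uses an empty passphrase; its public half is
+    # added to authorized_keys below alongside the master's public key.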
+ run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"""ssh-keygen -t rsa -f $HOME/.ssh/{WORKER_SSH_KEY_NAME} -N "" """, + ) + # Add both self and master public keys to authorized keys to allow password-less access to + # this container from authorized hosts. + run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"cp $HOME/.ssh/{WORKER_SSH_KEY_NAME}.pub $HOME/.ssh/authorized_keys", + ) + run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"""echo "{master_pub_key}" >> $HOME/.ssh/authorized_keys""", + ) + # Check if ssh agent is running or not, and if not, run it. + run_cmd_on_container( + WORKER_CONTAINER_NAME, + connection, + f"eval `ssh-agent -s` && ssh-add $HOME/.ssh/{WORKER_SSH_KEY_NAME}", + ) + # Start SSH service which uses configurations from /etc/ssh/sshd_config + run_cmd_on_container(WORKER_CONTAINER_NAME, connection, "service ssh start") + # Check status of SSH service, and fail test-setup if service doesn't run correctly. + ssh_status = run_cmd_on_container( + WORKER_CONTAINER_NAME, connection, "service ssh status", warn=True + ) + if ssh_status.failed: + raise RuntimeError("Failed to setup SSH Daemon on worker node") From 1783103b409f34d593ee11a9690919396bf03767 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 12:33:53 -0700 Subject: [PATCH 05/33] add v2 vllm EC2 tests --- test/v2/ec2/vllm/__init__.py | 0 test/v2/ec2/vllm/head_node_setup.sh | 37 ++ .../vllm/run_vllm_benchmark_single_node.sh | 52 +++ test/v2/ec2/vllm/run_vllm_on_arm64.sh | 113 +++++ test/v2/ec2/vllm/test_ec2.py | 423 ++++++++++++++++++ test/v2/ec2/vllm/worker_node_setup.sh | 25 ++ 6 files changed, 650 insertions(+) create mode 100644 test/v2/ec2/vllm/__init__.py create mode 100644 test/v2/ec2/vllm/head_node_setup.sh create mode 100644 test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh create mode 100644 test/v2/ec2/vllm/run_vllm_on_arm64.sh create mode 100644 test/v2/ec2/vllm/test_ec2.py create mode 100644 test/v2/ec2/vllm/worker_node_setup.sh diff --git a/test/v2/ec2/vllm/__init__.py b/test/v2/ec2/vllm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/v2/ec2/vllm/head_node_setup.sh b/test/v2/ec2/vllm/head_node_setup.sh new file mode 100644 index 000000000000..1c7bd4751824 --- /dev/null +++ b/test/v2/ec2/vllm/head_node_setup.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Usage: ./head_node_setup.sh +set -e + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +IMAGE_URI=$1 +HF_TOKEN=$2 +HEAD_IP=$3 +CONTAINER_NAME=$4 + +log "Starting head node setup..." 
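+# The head container is launched below in a detached tmux session: "ray start
+# --block" keeps the container in the foreground, so tmux lets this script
+# return while the Ray head keeps running.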
+log "Image URI: $IMAGE_URI" +log "Head IP: $HEAD_IP" + +# Start head node in tmux session and capture container ID +tmux new-session -d -s ray_head "docker run \ + --entrypoint /bin/bash \ + --network host \ + --name $CONTAINER_NAME \ + --shm-size 10.24g \ + --gpus all \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + -e VLLM_HOST_IP=$HEAD_IP \ + -e HF_TOKEN=$HF_TOKEN \ + -e FI_PROVIDER=efa \ + -e FI_EFA_USE_DEVICE_RDMA=1 \ + --device=/dev/infiniband/ \ + --ulimit memlock=-1:-1 \ + -p 8000:8000 \ + $IMAGE_URI -c 'ray start --block --head --port=6379'" + +log "Head node started in container: ${CONTAINER_NAME}" + diff --git a/test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh b/test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh new file mode 100644 index 000000000000..1d6c7c0ef999 --- /dev/null +++ b/test/v2/ec2/vllm/run_vllm_benchmark_single_node.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +DLC_IMAGE=$1 +HF_TOKEN=$2 +MODEL_NAME=$3 + +# Run vLLM using Official Docker image from https://docs.vllm.ai/en/latest/deployment/docker.html +# Here is the https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile +tmux new-session -d -s single_node "docker run --runtime nvidia --gpus all \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + -e "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -e "NCCL_DEBUG=TRACE" \ + -p 8000:8000 \ + --ipc=host \ + $DLC_IMAGE \ + --model $MODEL_NAME \ + --tensor-parallel-size 8" + +sleep 1500 + +source vllm_env/bin/activate + +# Example - Online Benchmark: https://github.com/vllm-project/vllm/tree/main/benchmarks#example---online-benchmark +python3 /fsx/vllm-dlc/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL_NAME \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /fsx/vllm-dlc/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 1000 + +# ============ Serving Benchmark Result ============ +# Successful requests: 1000 +# Benchmark duration (s): 82.67 +# Total input tokens: 215196 +# Total generated tokens: 185671 +# Request throughput (req/s): 12.10 +# Output token throughput (tok/s): 2245.92 +# Total Token throughput (tok/s): 4848.99 +# ---------------Time to First Token---------------- +# Mean TTFT (ms): 25037.89 +# Median TTFT (ms): 22099.12 +# P99 TTFT (ms): 58100.87 +# -----Time per Output Token (excl. 1st token)------ +# Mean TPOT (ms): 98.10 +# Median TPOT (ms): 92.09 +# P99 TPOT (ms): 256.34 +# ---------------Inter-token Latency---------------- +# Mean ITL (ms): 84.56 +# Median ITL (ms): 63.78 +# P99 ITL (ms): 253.97 +# ================================================== diff --git a/test/v2/ec2/vllm/run_vllm_on_arm64.sh b/test/v2/ec2/vllm/run_vllm_on_arm64.sh new file mode 100644 index 000000000000..d59ecc62f4aa --- /dev/null +++ b/test/v2/ec2/vllm/run_vllm_on_arm64.sh @@ -0,0 +1,113 @@ +#!/bin/bash +set -e + +DLC_IMAGE=$1 +HF_TOKEN=$2 + +if [ -z "$DLC_IMAGE" ] || [ -z "$HF_TOKEN" ]; then + echo "Usage: $0 " + exit 1 +fi + +MODEL_NAME="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" +CONTAINER_NAME="vllm-arm64-dlc" +PORT=8000 + +wait_for_api() { + local max_attempts=60 + local attempt=1 + + echo "Waiting for VLLM API to be ready..." + while ! 
curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "prompt": "What is vllm?", + "max_tokens": 30 + }' > /dev/null; do + if [ $attempt -ge $max_attempts ]; then + echo "Error: API failed to start after $max_attempts attempts" + docker logs ${CONTAINER_NAME} + exit 1 + fi + sleep 5 + ((attempt++)) + done + echo "API is ready!" +} + +cleanup() { + echo "Cleaning up containers..." + docker stop ${CONTAINER_NAME} 2>/dev/null || true + docker rm ${CONTAINER_NAME} 2>/dev/null || true +} + +handle_error() { + echo "Error occurred on line $1" + cleanup + exit 1 +} + +trap cleanup EXIT +trap 'handle_error $LINENO' ERR + +echo "####################### RUNNING INFERENCE CHECK ########################################" + +docker run --rm \ + -v /fsx/vllm-dlc/vllm:/vllm \ + --entrypoint /bin/bash \ + -e "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -e "VLLM_WORKER_MULTIPROC_METHOD=spawn" \ + -e "VLLM_USE_V1=0" \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + --gpus=all \ + $DLC_IMAGE \ + -c "python3 /vllm/examples/offline_inference/basic/generate.py \ + --model ${MODEL_NAME} \ + --dtype float16 \ + --tensor-parallel-size 1 \ + --max-model-len 2048" + +echo "####################### Starting VLLM server ##########################################" + +docker run -d \ + -v /fsx/vllm-dlc/vllm:/vllm \ + --name ${CONTAINER_NAME} \ + -p ${PORT}:8000 \ + --entrypoint /bin/bash \ + -e "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -e "VLLM_WORKER_MULTIPROC_METHOD=spawn" \ + -e "VLLM_USE_V1=0" \ + -v /fsx/.cache/huggingface:/root/.cache/huggingface \ + --gpus=all \ + $DLC_IMAGE \ + -c "vllm serve ${MODEL_NAME} \ + --dtype float16 \ + --gpu-memory-utilization 0.7 \ + --max-model-len 6000 \ + --enforce-eager \ + --reasoning-parser deepseek_r1" + +wait_for_api +docker logs "${CONTAINER_NAME}" + +echo "####################### API TESTING ###########################" + +curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "prompt": "What is AWS Deep learning container?", + "max_tokens": 50 + }' + +echo "####################### TESTING TOOL CALLS (OPEN AI API) ###########################" + +python -m venv .venv +source .venv/bin/activate + +pip install "openai>=1.0.0" +python3 /fsx/vllm-dlc/vllm/examples/online_serving/openai_chat_completion_with_reasoning.py +deactivate + +echo "####################### Testing completed successfully ###########################" diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py new file mode 100644 index 000000000000..ba64ee3fcc25 --- /dev/null +++ b/test/v2/ec2/vllm/test_ec2.py @@ -0,0 +1,423 @@ +import threading +import boto3 +import time, json +from botocore.exceptions import ClientError +from fabric import Connection + +from infra.test_infra.ec2.utils import ( + get_account_id_from_image_uri, + login_to_ecr_registry, + get_ec2_client, + install_python_in_instance, +) + +from infra.test_infra.test_infra_utils import create_logger +from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup +from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources, TEST_ID +from test.v2.ec2.efa.test_efa import ( + _setup_multinode_efa_instances, + EFA_SANITY_TEST_CMD, + MASTER_CONTAINER_NAME, + HOSTS_FILE_LOCATION, + EFA_INTEGRATION_TEST_CMD, + DEFAULT_EFA_TIMEOUT, + get_vllm_container_name, +) +from test.test_utils import run_cmd_on_container + +MODEL_NAME = 
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" +DEFAULT_REGION = "us-west-2" +LOGGER = create_logger(__name__) + + +def setup_env(connection): + """Setup Python environment on a node""" + setup_command = """ + python3 -m venv vllm_env && \ + source vllm_env/bin/activate && \ + pip install --upgrade pip setuptools wheel && \ + pip install numpy torch tqdm aiohttp pandas datasets pillow ray vllm==0.10.0 && \ + pip install "transformers<4.54.0" + """ + connection.run(setup_command, shell=True) + + +def create_benchmark_command() -> str: + """Create command for running benchmark""" + return f""" + vllm bench serve \ + --model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ + --backend vllm \ + --base-url "http://localhost:8000" \ + --endpoint '/v1/completions' \ + --dataset-name sharegpt \ + --dataset-path /fsx/vllm-dlc/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 1000 + """ + + +def get_secret_hf_token(): + secret_name = "test/hf_token" + region_name = "us-west-2" + + session = boto3.session.Session() + client = session.client(service_name="secretsmanager", region_name=region_name) + try: + get_secret_value_response = client.get_secret_value(SecretId=secret_name) + except ClientError as e: + raise e + + response = json.loads(get_secret_value_response["SecretString"]) + return response + + +def wait_for_container_ready(connection, container_name, timeout: int = 1000) -> bool: + """ + Wait for container and model to be ready by checking logs and endpoint + Returns True if container and model are ready, False if timeout + """ + start_time = time.time() + model_ready = False + + while time.time() - start_time < timeout: + if not model_ready: + try: + curl_cmd = """ + curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "prompt": "Hello", + "max_tokens": 10 + }' + """ + result = connection.run(curl_cmd, hide=False) + if result.ok: + print("Model endpoint is responding") + print("\n=== Complete vLLM Server Log ===") + connection.run(f"docker exec {container_name} cat vllm.log", hide=False) + print("=== End of Log ===\n") + model_ready = True + return True + except Exception: + pass + return False + + +def setup_docker_image(conn, image_uri): + account_id = get_account_id_from_image_uri(image_uri) + login_to_ecr_registry(conn, account_id, DEFAULT_REGION) + print(f"Pulling image: {image_uri}") + conn.run(f"docker pull {image_uri}", hide="out") + + +def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_uri): + try: + # Get HF token + response = get_secret_hf_token() + hf_token = response.get("HF_TOKEN") + if not hf_token: + raise Exception("Failed to get HF token") + + for conn in [head_connection, worker_connection]: + install_python_in_instance(conn, "3.10") + setup_docker_image(conn, image_uri) + setup_env(conn) + + head_connection.put("v2/ec2/vllm/head_node_setup.sh", "/home/ec2-user/head_node_setup.sh") + worker_connection.put( + "v2/ec2/vllm/worker_node_setup.sh", "/home/ec2-user/worker_node_setup.sh" + ) + + head_connection.run("chmod +x head_node_setup.sh") + worker_connection.run("chmod +x worker_node_setup.sh") + + head_ip = head_connection.run("hostname -i").stdout.strip() + worker_ip = worker_connection.run("hostname -i").stdout.strip() + + container_name = "ray_head-" + TEST_ID + print("Starting head node...") + head_connection.run( + f"./head_node_setup.sh {image_uri} {hf_token} {head_ip} {container_name}" + ) + + 
worker_connection.run(f"./worker_node_setup.sh {image_uri} {head_ip} {worker_ip}") + + # add timer to let container run + time.sleep(30) + + commands = ["ray status", "fi_info -p efa"] + for command in commands: + head_connection.run(f"docker exec -i {container_name} /bin/bash -c '{command}'") + + serve_command = f"vllm serve {MODEL_NAME} --tensor-parallel-size 8 --pipeline-parallel-size 2 --max-num-batched-tokens 16384" + head_connection.run( + f"docker exec -i {container_name} /bin/bash -c '{serve_command} > vllm.log 2>&1 &'" + ) + + print("Waiting for model to be ready, approx estimated time to complete is 15 mins...") + if not wait_for_container_ready(head_connection, container_name, timeout=2000): + raise Exception("Container failed to become ready within timeout period") + + print("Running benchmark...") + benchmark_cmd = "source vllm_env/bin/activate &&" + create_benchmark_command() + benchmark_result = head_connection.run(benchmark_cmd, timeout=7200) + + return benchmark_result + + except Exception as e: + raise Exception(f"Multi-node test execution failed: {str(e)}") + + +def verify_gpu_setup(connection): + """ + Verify GPU setup on the instance before running the test + + Args: + connection: Fabric connection object to EC2 instance + + Returns: + bool: True if GPU setup is valid, False otherwise + """ + try: + # Check nvidia-smi + result = connection.run("nvidia-smi", hide=True) + if result.failed: + print("nvidia-smi check failed") + return False + + # Check CUDA availability + cuda_check = connection.run("nvidia-smi -L", hide=True) + if cuda_check.failed or "GPU" not in cuda_check.stdout: + print("No GPUs found") + return False + + return True + + except Exception as e: + print(f"GPU verification failed: {str(e)}") + return False + + +def cleanup_containers(connection): + """ + Cleanup docker containers and images on the instance + + Args: + connection: Fabric connection object + """ + try: + print("Cleaning up containers and images...") + commands = [ + "docker ps -aq | xargs -r docker stop", + "docker ps -aq | xargs -r docker rm", + ] + for cmd in commands: + connection.run(cmd, hide=True, warn=True) + except Exception as e: + print(f"Cleanup warning: {str(e)}") + + +def run_multi_node_test(head_conn, worker_conn, image_uri): + """ + Run multi-node VLLM benchmark test + + Args: + head_conn: Fabric connection object for head node + worker_conn: Fabric connection object for worker node + image_uri: ECR image URI + """ + + print("\n=== Starting Multi-Node Test ===") + verification_tasks = [(head_conn, "head"), (worker_conn, "worker")] + for conn, node_type in verification_tasks: + if not verify_gpu_setup(conn): + raise Exception(f"GPU setup verification failed for {node_type} node") + + result = test_vllm_benchmark_on_multi_node(head_conn, worker_conn, image_uri) + if result.ok: + print("Multi-node test completed successfully") + return True + return False + + +def run_single_node_test(head_conn, image_uri): + """ + Run single-node VLLM benchmark test + + Args: + head_conn: Fabric connection object for head node + image_uri: ECR image URI + """ + if not verify_gpu_setup(head_conn): + raise Exception(f"GPU setup verification failed for head node") + + try: + install_python_in_instance(head_conn, python_version="3.10") + + response = get_secret_hf_token() + hf_token = response.get("HF_TOKEN") + + setup_docker_image(head_conn, image_uri) + + head_conn.put( + "v2/ec2/vllm/run_vllm_on_arm64.sh", + "/home/ec2-user/run_vllm_on_arm64.sh", + ) + commands = [ + "chmod +x 
/home/ec2-user/run_vllm_on_arm64.sh", + f"/home/ec2-user/run_vllm_on_arm64.sh {image_uri} {hf_token}", + ] + + result = head_conn.run( + "; ".join(commands), + hide=False, + timeout=7200, + ) + + except Exception as e: + print(f"Test execution failed: {str(e)}") + raise + + if result.ok: + print("Single-node test completed successfully") + return True + + +def test_vllm_on_ec2(resources, image_uri, test_config=None): + """ + Test VLLM on EC2 instances: + - For non-arm64: EFA testing, Single node test, and Multi-node test + - For arm64: Single node test only + + Args: + resources: Dictionary containing instance information and FSx config + image_uri: Docker image URI to test + test_config: Dictionary containing test configuration (arch_type, framework, etc.) + """ + # Extract arch_type from test_config, fallback to parsing from image_uri if not provided + if test_config and "arch_type" in test_config: + arch_type = test_config["arch_type"] + else: + arch_type = "arm64" if "arm64" in image_uri else "x86_64" + ec2_cli = None + fsx = None + ec2_connections = {} + test_results = {"efa": None, "single_node": None, "multi_node": None} + + # to test agents + + try: + ec2_cli = get_ec2_client(DEFAULT_REGION) + fsx = FsxSetup(DEFAULT_REGION) + + for instance_id, key_filename in resources["instances_info"]: + try: + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ + "Reservations" + ][0]["Instances"][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + connection.run('echo "Connection test"', hide=True) + ec2_connections[instance_id] = connection + print(f"Successfully connected to instance {instance_id}") + + except Exception as e: + print(f"Failed to connect to instance {instance_id}: {str(e)}") + raise + + is_arm64 = "arm64" in image_uri + instance_ids = list(ec2_connections.keys()) + head_conn = ec2_connections[instance_ids[0]] + + if is_arm64: + print("\n=== Starting ARM64 Single Node Test ===") + test_results["single_node"] = run_single_node_test(head_conn, image_uri) + print( + f"ARM64 Single node test: {'Passed' if test_results['single_node'] else 'Failed'}" + ) + + elif len(ec2_connections) >= 2: + worker_conn = ec2_connections[instance_ids[1]] + + print("\n=== Starting EFA Tests ===") + _setup_multinode_efa_instances( + image_uri, + resources["instances_info"][:2], + [head_conn, worker_conn], + "p4d.24xlarge", + DEFAULT_REGION, + arch_type, + ) + + # Determine the master container name - vLLM uses unique names + master_container_name = get_vllm_container_name("efa", arch_type, "master") + + # Run EFA sanity test + run_cmd_on_container(master_container_name, head_conn, EFA_SANITY_TEST_CMD, hide=False) + + # Run EFA integration test + run_cmd_on_container( + master_container_name, + head_conn, + f"{EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} 2", + hide=False, + timeout=DEFAULT_EFA_TIMEOUT, + ) + + test_results["efa"] = True + + for conn in [head_conn, worker_conn]: + cleanup_containers(conn) + + print("EFA tests completed successfully") + + # Run multi-node test + test_results["multi_node"] = run_multi_node_test(head_conn, worker_conn, image_uri) + + else: + print("\nSkipping multi-node test: insufficient instances") + + print("\n=== Test Summary ===") + for test_name, result in test_results.items(): + if result is not None: + print( + 
f"{test_name.replace('_', ' ').title()} test: {'Passed' if result else 'Failed'}" + ) + else: + print(f"{test_name.replace('_', ' ').title()} test: Not Run") + + if is_arm64: + if not test_results["single_node"]: + raise Exception("Single node test failed for ARM64") + elif not any(result for result in test_results.values() if result is not None): + raise Exception("All tests failed") + + except Exception as e: + print(f"Test execution failed: {str(e)}") + raise + + finally: + if ec2_cli and fsx: + cleanup_timer = threading.Timer( + 1000, lambda: print("Cleanup timed out, some resources might need manual cleanup") + ) + cleanup_timer.start() + + try: + cleanup_resources(ec2_cli, resources, fsx) + cleanup_timer.cancel() + print("Resources cleaned up successfully") + except Exception as e: + print(f"Cleanup failed: {str(e)}") + finally: + cleanup_timer.cancel() diff --git a/test/v2/ec2/vllm/worker_node_setup.sh b/test/v2/ec2/vllm/worker_node_setup.sh new file mode 100644 index 000000000000..c6670e882fa8 --- /dev/null +++ b/test/v2/ec2/vllm/worker_node_setup.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Usage: ./worker_node_setup.sh +set -e + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +IMAGE_URI=$1 +HEAD_IP=$2 +WORKER_IP=$3 + +tmux new-session -d -s ray_worker "bash /fsx/vllm-dlc/vllm/examples/online_serving/run_cluster.sh \ + $IMAGE_URI \ + $HEAD_IP \ + --worker \ + /fsx/.cache/huggingface \ + -e VLLM_HOST_IP=$WORKER_IP \ + -e FI_PROVIDER=efa \ + -e FI_EFA_USE_DEVICE_RDMA=1 \ + --device=/dev/infiniband/ \ + --ulimit memlock=-1:-1" + +log "Worker node setup complete." \ No newline at end of file From e79206b9d34518477d0c2606029b1832bafc114a Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 13:36:32 -0700 Subject: [PATCH 06/33] connect v2 test infra with test --- infra/test_infra/ec2/setup.py | 80 +++++++++++++++++++++++++++------- infra/test_infra/ec2/utils.py | 1 - infra/test_infra/eks/setup.py | 35 ++++++++------- infra/test_infra/entrypoint.py | 33 +++++++++----- test/test_utils/__init__.py | 3 ++ test/v2/ec2/vllm/test_ec2.py | 16 ++++--- 6 files changed, 119 insertions(+), 49 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index ea4a0084a54c..48874c8e6e3b 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -1,5 +1,4 @@ import os -import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path from infra.test_infra.test_infra_utils import create_logger @@ -13,6 +12,8 @@ def __init__(self): self.region = os.getenv("REGION", "us-west-2") self.build_context = os.getenv("BUILD_CONTEXT") self.image_uri = None + self.framework = None + self.arch_type = None self.ctx = Context() def setup(self, params): @@ -21,12 +22,17 @@ def setup(self, params): """ LOGGER.info(f"Setting up EC2 platform with params: {params}") - framework = params.get("framework") + self.framework = params.get("framework") + self.arch_type = params.get("arch_type", "x86_64") self.image_uri = params.get("image_uri") - if framework == "vllm": - # vllm requires vLLM-specific setup (FSx + multi-node) - LOGGER.info(f"Would call vLLM setup for image: {self.image_uri}") + if self.framework == "vllm": + # vLLM requires vLLM-specific setup (FSx + multi-node) + LOGGER.info(f"Setting up vLLM infrastructure for image: {self.image_uri}") + from infra.test_infra.ec2.vllm.setup_ec2 import setup as vllm_setup + + self.resources = vllm_setup(self.image_uri) + LOGGER.info("vLLM setup completed successfully") else: # 
standard EC2 setup for other frameworks LOGGER.info(f"Would call standard EC2 setup for image: {self.image_uri}") @@ -34,19 +40,63 @@ def setup(self, params): def execute_command(self, cmd): """ - Execute a test command with proper environment setup + Execute a test command with proper environment setup. + Raises exception immediately if command fails. + """ + try: + # Set up environment variables for all commands + env = { + "AWS_REGION": self.region, + "BUILD_CONTEXT": self.build_context, + "DLC_IMAGE": self.image_uri, + "ARCH_TYPE": self.arch_type, + "FRAMEWORK": self.framework, + } + + # Check if this is a vLLM test command + if self.framework == "vllm" and "test/v2/ec2/vllm/test_ec2.py" in cmd: + LOGGER.info(f"Executing vLLM test via direct call: {cmd}") + from test.v2.ec2.vllm.test_ec2 import test_vllm_on_ec2 + + # Pass resources and image_uri; test reads config from env vars + test_vllm_on_ec2(self.resources, self.image_uri) + LOGGER.info(f"Command completed successfully: {cmd}") + else: + # Standard shell command execution for other cases + repo_root = get_cloned_folder_path() + + with self.ctx.cd(repo_root): + LOGGER.info(f"Executing command from {repo_root} with EC2 environment: {cmd}") + self.ctx.run(cmd, env=env) + LOGGER.info(f"Command completed successfully: {cmd}") + + except Exception as e: + raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e + + def cleanup(self): + """ + Cleanup EC2 resources """ - env = { - "AWS_REGION": self.region, - "BUILD_CONTEXT": self.build_context, - "DLC_IMAGE": self.image_uri, - } + if not self.resources: + LOGGER.info("No resources to cleanup") + return - repo_root = get_cloned_folder_path() + if self.framework == "vllm": + LOGGER.info("Cleaning up vLLM resources") + try: + from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources + from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup + from infra.test_infra.ec2.utils import get_ec2_client - with self.ctx.cd(repo_root): - LOGGER.info(f"Executing command from {repo_root} with EC2 environment: {cmd}") - self.ctx.run(cmd, env=env) + ec2_client = get_ec2_client(self.region) + fsx = FsxSetup(self.region) + cleanup_resources(ec2_client, self.resources, fsx) + LOGGER.info("vLLM cleanup completed successfully") + except Exception as e: + LOGGER.error(f"Error during vLLM cleanup: {e}") + raise + else: + LOGGER.info("Standard EC2 cleanup not yet implemented") def _standard_ec2_setup(self, params): """ diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index 86e3d0b8a3a8..3159b172db3c 100644 --- a/infra/test_infra/ec2/utils.py +++ b/infra/test_infra/ec2/utils.py @@ -1,7 +1,6 @@ import os import time import re -import logging import sys import uuid import copy diff --git a/infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py index 9e287e77ddde..a69aa07a5921 100644 --- a/infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -1,5 +1,4 @@ import os -import sys from invoke.context import Context from codebuild_environment import get_cloned_folder_path from infra.test_infra.test_infra_utils import create_logger @@ -35,18 +34,24 @@ def setup(self, params): def execute_command(self, cmd): """ - Execute a test command with proper environment setup + Execute a test command with proper environment setup. + Raises exception immediately if command fails. 
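+        :param cmd: shell command executed from the repository root with the EKS
+            environment variables (region, cluster, namespace, image URI) applied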
""" - env = { - "AWS_REGION": self.region, - "CLUSTER_NAME": self.cluster_name, - "NAMESPACE": self.namespace, - "BUILD_CONTEXT": self.build_context, - "DLC_IMAGE": self.image_uri, - } - - repo_root = get_cloned_folder_path() - - with self.ctx.cd(repo_root): - LOGGER.info(f"Executing command from {repo_root} with EKS environment: {cmd}") - self.ctx.run(cmd, env=env) + try: + env = { + "AWS_REGION": self.region, + "CLUSTER_NAME": self.cluster_name, + "NAMESPACE": self.namespace, + "BUILD_CONTEXT": self.build_context, + "DLC_IMAGE": self.image_uri, + } + + repo_root = get_cloned_folder_path() + + with self.ctx.cd(repo_root): + LOGGER.info(f"Executing command from {repo_root} with EKS environment: {cmd}") + self.ctx.run(cmd, env=env) + LOGGER.info(f"Command completed successfully: {cmd}") + + except Exception as e: + raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e diff --git a/infra/test_infra/entrypoint.py b/infra/test_infra/entrypoint.py index 7fe859bc087c..9908eeea3511 100644 --- a/infra/test_infra/entrypoint.py +++ b/infra/test_infra/entrypoint.py @@ -1,5 +1,4 @@ import os -import sys from src.config import is_new_test_structure_enabled from test.test_utils import get_dlc_images from codebuild_environment import get_cloned_folder_path @@ -58,16 +57,28 @@ def main(): platform_name = test_config["platform"] LOGGER.info(f"Test config {i+1}: platform={platform_name}") - if test_type == "ec2" and platform_name.startswith("ec2"): - LOGGER.info(f"Executing EC2 test for platform: {platform_name}") - execute_platform_tests(EC2Platform(), test_config, buildspec_data, image_uri) - elif test_type == "eks" and platform_name.startswith("eks"): - LOGGER.info(f"Executing EKS test for platform: {platform_name}") - execute_platform_tests(EKSPlatform(), test_config, buildspec_data, image_uri) - else: - LOGGER.info( - f"Skipping test config {i+1}: test_type={test_type}, platform={platform_name}" - ) + platform = None + try: + if test_type == "ec2" and platform_name.startswith("ec2"): + LOGGER.info(f"Executing EC2 test for platform: {platform_name}") + platform = EC2Platform() + execute_platform_tests(platform, test_config, buildspec_data, image_uri) + elif test_type == "eks" and platform_name.startswith("eks"): + LOGGER.info(f"Executing EKS test for platform: {platform_name}") + platform = EKSPlatform() + execute_platform_tests(platform, test_config, buildspec_data, image_uri) + else: + LOGGER.info( + f"Skipping test config {i+1}: test_type={test_type}, platform={platform_name}" + ) + finally: + # Cleanup resources if platform supports it + if platform is not None and hasattr(platform, "cleanup"): + LOGGER.info(f"Cleaning up platform resources for {platform_name}") + try: + platform.cleanup() + except Exception as e: + LOGGER.error(f"Cleanup failed for {platform_name}: {e}") if __name__ == "__main__": diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 70ddc7d0d564..bc6793da9a1b 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -215,6 +215,9 @@ def get_ami_id_ssm(region_name, parameter_path): # Used for referencing tests scripts from container_tests directory (i.e. 
from ECS cluster) CONTAINER_TESTS_PREFIX = os.path.join(os.sep, "test", "bin") +# Used for referencing test scripts in the new v2 test structure +CONTAINER_TESTS_PREFIX_V2 = os.path.join(os.sep, "test", "v2", "ec2") + # S3 Bucket to use to transfer tests into an EC2 instance TEST_TRANSFER_S3_BUCKET = f"s3://dlinfra-tests-transfer-bucket-{ACCOUNT_ID}" diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index ba64ee3fcc25..d227902e69d6 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -284,7 +284,7 @@ def run_single_node_test(head_conn, image_uri): return True -def test_vllm_on_ec2(resources, image_uri, test_config=None): +def test_vllm_on_ec2(resources, image_uri): """ Test VLLM on EC2 instances: - For non-arm64: EFA testing, Single node test, and Multi-node test @@ -293,13 +293,15 @@ def test_vllm_on_ec2(resources, image_uri, test_config=None): Args: resources: Dictionary containing instance information and FSx config image_uri: Docker image URI to test - test_config: Dictionary containing test configuration (arch_type, framework, etc.) + + Environment Variables: + ARCH_TYPE: Architecture type (x86_64 or arm64) + AWS_REGION: AWS region + FRAMEWORK: Framework being tested (vllm) """ - # Extract arch_type from test_config, fallback to parsing from image_uri if not provided - if test_config and "arch_type" in test_config: - arch_type = test_config["arch_type"] - else: - arch_type = "arm64" if "arm64" in image_uri else "x86_64" + # Read arch_type from environment variable + import os + arch_type = os.getenv("ARCH_TYPE", "x86_64") ec2_cli = None fsx = None ec2_connections = {} From c45457c82925e4c07c5d35fa5c6802eaedc6d71c Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 13:37:15 -0700 Subject: [PATCH 07/33] add framework and arch_type to eks setup as env vars for consistency --- infra/test_infra/eks/setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py index a69aa07a5921..291af9265925 100644 --- a/infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -14,6 +14,8 @@ def __init__(self): self.cluster_name = None self.namespace = None self.image_uri = None + self.framework = None + self.arch_type = None self.ctx = Context() def setup(self, params): @@ -22,14 +24,15 @@ def setup(self, params): """ LOGGER.info(f"Setting up EKS platform with params: {params}") - framework = params.get("framework") + self.framework = params.get("framework") + self.arch_type = params.get("arch_type", "x86_64") cluster_prefix = params.get("cluster") self.cluster_name = f"{cluster_prefix}-{self.build_context}" self.namespace = params.get("namespace") self.image_uri = params.get("image_uri") LOGGER.info( - f"EKS Platform - Framework: {framework}, Cluster: {self.cluster_name}, Namespace: {self.namespace}" + f"EKS Platform - Framework: {self.framework}, Cluster: {self.cluster_name}, Namespace: {self.namespace}" ) def execute_command(self, cmd): @@ -44,6 +47,8 @@ def execute_command(self, cmd): "NAMESPACE": self.namespace, "BUILD_CONTEXT": self.build_context, "DLC_IMAGE": self.image_uri, + "ARCH_TYPE": self.arch_type, + "FRAMEWORK": self.framework, } repo_root = get_cloned_folder_path() From 7cfbd8e3941d8376d9c63faf3f7464cf3a50d22d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 13:38:57 -0700 Subject: [PATCH 08/33] test run with new path --- dlc_developer_config.toml | 2 +- vllm/buildspec.yml | 7 +++++++ 2 files changed, 8 insertions(+), 1 
deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 90c179d30484..42d5c76fba0a 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = false +use_new_test_structure = true ### On by default sanity_tests = true diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index 90dab034e893..a007d441998e 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -52,6 +52,13 @@ images: - ec2 - eks tests: + - platform: ec2-multi-node-efa + params: + instance_type: p4d.24xlarge + node_count: 2 + run: + - python test/v2/ec2/vllm/test_ec2.py + - platform: eks params: cluster: dlc-vllm From 4ecd5b8902e481c40d81d9df1c016143190de4e9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 14:20:04 -0700 Subject: [PATCH 09/33] dummy commit --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 42d5c76fba0a..6bfe70c08d6d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = true +use_new_test_structure = true ### On by default sanity_tests = true From bc16bb657dce9435d49710ee9028ee39161d461e Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 16:51:53 -0700 Subject: [PATCH 10/33] fix import and comment out eks path --- infra/test_infra/ec2/utils.py | 4 +++- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index 3159b172db3c..23dee77758f1 100644 --- a/infra/test_infra/ec2/utils.py +++ b/infra/test_infra/ec2/utils.py @@ -36,8 +36,10 @@ login_to_ecr_registry, get_account_id_from_image_uri, UL_AMI_LIST, + DEFAULT_REGION, + P4DE_REGION, + BENCHMARK_RESULTS_S3_BUCKET, ) -from . 
import DEFAULT_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET from infra.test_infra.test_infra_utils import create_logger EC2_INSTANCE_ROLE_NAME = "ec2TestInstanceRole" diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index a007d441998e..e79fe18ff8f8 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - - platform: eks - params: - cluster: dlc-vllm - namespace: vllm - run: - - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + # - platform: eks + # params: + # cluster: dlc-vllm + # namespace: vllm + # run: + # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From 64bd8782b4f5d8374bfb0341c4f8a1d408a6ea24 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Wed, 15 Oct 2025 20:31:39 -0700 Subject: [PATCH 11/33] update vllm test helper --- test/testrunner.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/testrunner.py b/test/testrunner.py index ef687ee85794..e3f0b73b8e0c 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -290,14 +290,6 @@ def run_vllm_tests(test_type, all_image_list, new_test_structure_enabled): try: LOGGER.info(f"Running vLLM {test_type.upper()} tests with image: {all_image_list[0]}") if new_test_structure_enabled: - project_root = os.path.dirname(os.path.dirname(os.getcwd())) - spec = importlib.util.spec_from_file_location( - "entrypoint", - os.path.join(project_root, ".infra", "test_infra", "entrypoint.py"), - ) - entrypoint_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(entrypoint_module) - run_new_tests = entrypoint_module.main LOGGER.info("Using new buildspec-based test system") run_new_tests() else: From a6cc859f4aa650bc492def88dfaaf138ed82f6bd Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 12:58:12 -0700 Subject: [PATCH 12/33] update container naming logic --- test/v2/ec2/efa/test_efa.py | 77 +++++++++++++++++++++--------------- test/v2/ec2/vllm/test_ec2.py | 6 +-- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index dded29bdffcc..1c1abd5b1675 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -42,19 +42,25 @@ DEFAULT_EFA_TIMEOUT = 300 -def get_vllm_container_name(test_scenario, arch_type, node_role=None): +def get_efa_container_name(framework, test_scenario, arch_type, node_role=None): """ - Generate unique container name for vLLM v2 EC2 tests + Generate unique container name for EC2 EFA tests Args: - test_scenario: Test scenario (e.g., "efa", "single-node") + framework: Framework name (e.g., "vllm", "pytorch", "tensorflow") + test_scenario: Test scenario - "efa" arch_type: Architecture from buildspec (e.g., "x86_64", "arm64") - node_role: For multi-node: "master", "worker-0", etc. (optional) + node_role: For multi-node: "master", "worker-0", etc. 
Returns: - Container name like "vllm-ec2-efa-x86_64-master" or "vllm-ec2-single-node-arm64" + Container name like "vllm-ec2-efa-x86_64-master" """ - base_name = f"vllm-ec2-{test_scenario}-{arch_type}" + # Try to get framework from environment variable first + detected_framework = os.environ.get("FRAMEWORK") + if not detected_framework: + detected_framework = framework + + base_name = f"{detected_framework}-ec2-{test_scenario}-{arch_type}" return f"{base_name}-{node_role}" if node_role else base_name @@ -196,7 +202,7 @@ def _setup_multinode_efa_instances( # Use provided arch_type or infer from image as fallback if arch_type is None: arch_type = "arm64" if "arm64" in image else "x86_64" - master_container_name = get_vllm_container_name("efa", arch_type, "master") + master_container_name = get_efa_container_name("vllm", "efa", arch_type, "master") else: master_container_name = MASTER_CONTAINER_NAME @@ -216,7 +222,7 @@ def _setup_multinode_efa_instances( for idx, worker_connection in enumerate(efa_ec2_connections[1:]): # Determine worker container name if "vllm" in image: - worker_container_name = get_vllm_container_name("efa", arch_type, f"worker-{idx}") + worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") else: worker_container_name = WORKER_CONTAINER_NAME @@ -233,22 +239,28 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises.append(promise) # Configure master node SSH client-side configurations - _setup_master_efa_ssh_config(master_connection) + _setup_master_efa_ssh_config(master_connection, master_container_name) # Create a hosts file that provides mpi with IP addresses and no. of GPUs in each node worker_instance_ids = [instance_id for instance_id, _ in efa_ec2_instances[1:]] _create_master_mpi_hosts_file( - efa_ec2_connections, worker_instance_ids, ec2_instance_type, region + efa_ec2_connections, worker_instance_ids, ec2_instance_type, region, master_container_name ) # Obtain master node SSH public key for future use master_pub_key = run_cmd_on_container( - MASTER_CONTAINER_NAME, master_connection, f"cat $HOME/.ssh/{MASTER_SSH_KEY_NAME}.pub" + master_container_name, master_connection, f"cat $HOME/.ssh/{MASTER_SSH_KEY_NAME}.pub" ).stdout.strip("\n") # Configure worker node containers - for worker_connection in efa_ec2_connections[1:]: + for idx, worker_connection in enumerate(efa_ec2_connections[1:]): + # Determine worker container name + if "vllm" in image: + worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") + else: + worker_container_name = WORKER_CONTAINER_NAME + # Configure worker node SSH server-side configurations, launch SSH daemon, and allow # password-less SSH access from master to worker nodes. 
- _setup_worker_efa_ssh_config(worker_connection, master_pub_key) + _setup_worker_efa_ssh_config(worker_connection, master_pub_key, worker_container_name) # Wait for all_reduce_perf binaries to be built in all containers for promise in build_all_reduce_perf_promises: @@ -302,18 +314,19 @@ def _setup_container(connection, docker_image, container_name): ) -def _setup_master_efa_ssh_config(connection): +def _setup_master_efa_ssh_config(connection, master_container_name): """ Set up SSH client config on master container to connect to worker :param connection: Fabric Connection object + :param master_container_name: str master container name """ run_cmd_on_container( - MASTER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{MASTER_SSH_KEY_NAME}*" + master_container_name, connection, f"rm -rf $HOME/.ssh/{MASTER_SSH_KEY_NAME}*" ) # When running container in --network=host, the container hostname changes, requiring # a new SSH key to be generated. run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, connection, f"""ssh-keygen -t rsa -f $HOME/.ssh/{MASTER_SSH_KEY_NAME} -N "" """, ) @@ -327,20 +340,21 @@ def _setup_master_efa_ssh_config(connection): " Port 2022" ) run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, connection, f"""echo -e "{master_container_ssh_config}" > $HOME/.ssh/config""", ) - run_cmd_on_container(MASTER_CONTAINER_NAME, connection, "chmod -R 600 $HOME/.ssh/*") + run_cmd_on_container(master_container_name, connection, "chmod -R 600 $HOME/.ssh/*") -def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region): +def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region, master_container_name): """ Create MPI Hosts file that contains private IP addresses of all hosts used in training job. :param efa_ec2_connections: List of Fabric Connection objects [master_connection, *worker_connections] :param worker_instance_ids: list of str worker instance IDs :param instance_type: str EC2 Instance Type being used :param region: str region name in which test is run + :param master_container_name: str master container name """ master_connection = efa_ec2_connections[0] slots = ec2_utils.get_instance_num_gpus(instance_type=instance_type) @@ -368,11 +382,11 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst compute_counter += 1 run_cmd_on_container( - MASTER_CONTAINER_NAME, master_connection, f"""echo "{etc_string}" > /etc/hosts""" + master_container_name, master_connection, f"""echo "{etc_string}" > /etc/hosts""" ) run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, master_connection, f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) @@ -383,55 +397,56 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst hosts_string += f"\n{worker_ip} slots={slots} " run_cmd_on_container( - MASTER_CONTAINER_NAME, + master_container_name, master_connection, f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""", ) -def _setup_worker_efa_ssh_config(connection, master_pub_key): +def _setup_worker_efa_ssh_config(connection, master_pub_key, worker_container_name): """ Set up SSH server config on worker container to allow connections from master. 
:param connection: Fabric Connection object :param master_pub_key: str Master node public SSH key to allow password-less SSH access + :param worker_container_name: str worker container name """ # Force SSH Daemon to use port 2022, since port 22 is already in use by the host instance run_cmd_on_container( - WORKER_CONTAINER_NAME, connection, """echo "Port 2022" >> /etc/ssh/sshd_config""" + worker_container_name, connection, """echo "Port 2022" >> /etc/ssh/sshd_config""" ) run_cmd_on_container( - WORKER_CONTAINER_NAME, connection, f"rm -rf $HOME/.ssh/{WORKER_SSH_KEY_NAME}*" + worker_container_name, connection, f"rm -rf $HOME/.ssh/{WORKER_SSH_KEY_NAME}*" ) # When running container in --network=host, the container hostname changes, requiring # a new SSH key to be generated. run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"""ssh-keygen -t rsa -f $HOME/.ssh/{WORKER_SSH_KEY_NAME} -N "" """, ) # Add both self and master public keys to authorized keys to allow password-less access to # this container from authorized hosts. run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"cp $HOME/.ssh/{WORKER_SSH_KEY_NAME}.pub $HOME/.ssh/authorized_keys", ) run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"""echo "{master_pub_key}" >> $HOME/.ssh/authorized_keys""", ) # Check if ssh agent is running or not, and if not, run it. run_cmd_on_container( - WORKER_CONTAINER_NAME, + worker_container_name, connection, f"eval `ssh-agent -s` && ssh-add $HOME/.ssh/{WORKER_SSH_KEY_NAME}", ) # Start SSH service which uses configurations from /etc/ssh/sshd_config - run_cmd_on_container(WORKER_CONTAINER_NAME, connection, "service ssh start") + run_cmd_on_container(worker_container_name, connection, "service ssh start") # Check status of SSH service, and fail test-setup if service doesn't run correctly. 
ssh_status = run_cmd_on_container( - WORKER_CONTAINER_NAME, connection, "service ssh status", warn=True + worker_container_name, connection, "service ssh status", warn=True ) if ssh_status.failed: raise RuntimeError("Failed to setup SSH Daemon on worker node") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index d227902e69d6..2f5b1895e854 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -21,7 +21,7 @@ HOSTS_FILE_LOCATION, EFA_INTEGRATION_TEST_CMD, DEFAULT_EFA_TIMEOUT, - get_vllm_container_name, + get_efa_container_name, ) from test.test_utils import run_cmd_on_container @@ -361,8 +361,8 @@ def test_vllm_on_ec2(resources, image_uri): arch_type, ) - # Determine the master container name - vLLM uses unique names - master_container_name = get_vllm_container_name("efa", arch_type, "master") + # Determine the master container name + master_container_name = get_efa_container_name("vllm", "efa", arch_type, "master") # Run EFA sanity test run_cmd_on_container(master_container_name, head_conn, EFA_SANITY_TEST_CMD, hide=False) From feb3a96f0bf74b30b7687f9cac5cdc0a4f541534 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 14:48:22 -0700 Subject: [PATCH 13/33] move missing efa test files --- test/v2/ec2/efa/build_all_reduce_perf.sh | 20 ++++++++++++++ test/v2/ec2/efa/efa_test.sh | 33 ++++++++++++++++++++++++ test/v2/ec2/efa/testEFASanity | 2 +- 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 test/v2/ec2/efa/build_all_reduce_perf.sh create mode 100644 test/v2/ec2/efa/efa_test.sh diff --git a/test/v2/ec2/efa/build_all_reduce_perf.sh b/test/v2/ec2/efa/build_all_reduce_perf.sh new file mode 100644 index 000000000000..70f0cfecaaed --- /dev/null +++ b/test/v2/ec2/efa/build_all_reduce_perf.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# For dockerfiles of PyTorch >= 2.0, CUDA_HOME is already set as an env, and is configured as /opt/conda +python -c "import torch; from packaging.version import Version; assert Version(torch.__version__) >= Version('2.0')" +TORCH_VERSION_2x=$? +if [ $TORCH_VERSION_2x -ne 0 ]; then + CUDA_HOME=/usr/local/cuda +fi + +set -e + +echo "Building all_reduce_perf from nccl-tests" +cd /tmp/ +rm -rf nccl-tests/ +git clone https://github.com/NVIDIA/nccl-tests.git +cd nccl-tests/ +make MPI=1 MPI_HOME=/opt/amazon/openmpi NCCL_HOME=/usr/local CUDA_HOME=${CUDA_HOME} +cp build/all_reduce_perf /all_reduce_perf +cd /tmp/ +rm -rf nccl-tests/ \ No newline at end of file diff --git a/test/v2/ec2/efa/efa_test.sh b/test/v2/ec2/efa/efa_test.sh new file mode 100644 index 000000000000..2a960e6023d7 --- /dev/null +++ b/test/v2/ec2/efa/efa_test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +set -ex + +# Script to run fi_pingpong locally over EFA to test connectivity. + +if ! command -v fi_info >/dev/null 2>&1 || ! command -v fi_pingpong >/dev/null 2>&1; then + echo "Error: required libfabric binaries not found." + exit 1 +fi + +if ! fi_info -p efa >/dev/null 2>&1; then + echo "Error: EFA libfabric provider not detected." >&2 + exit 1 +fi + +echo "Starting server..." +FI_EFA_ENABLE_SHM_TRANSFER=0 fi_pingpong -e rdm -p efa >/dev/null 2>&1 & +sleep 0.5 + +echo "Starting client..." +FI_EFA_ENABLE_SHM_TRANSFER=0 timeout 8 fi_pingpong -e rdm -p efa localhost +ret=$? +if [ $ret -ne 0 ]; then + if [ $ret -eq 124 ]; then + echo "Error: fi_pingpong test timed out." >&2 + else + echo "Error: fi_pingpong test returned $ret." 
>&2 + fi +fi +kill %1 +exit $ret diff --git a/test/v2/ec2/efa/testEFASanity b/test/v2/ec2/efa/testEFASanity index 1f350628c668..051f77351a65 100644 --- a/test/v2/ec2/efa/testEFASanity +++ b/test/v2/ec2/efa/testEFASanity @@ -18,7 +18,7 @@ apt-get update && apt-get install -y kmod lsmod | grep ib_uverbs # ensure that the security group created is configured correctly -/test/bin/efa/efa_test.sh +/test/v2/ec2/efa/efa_test.sh # Queries local RDMA devices ibv_devinfo From 35f0d3e974c4ba803c87f52ccbc44a6d97fd08e9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 16:42:29 -0700 Subject: [PATCH 14/33] make container path and instance path consistent --- infra/test_infra/ec2/vllm/setup_ec2.py | 4 ++-- test/v2/ec2/efa/test_efa.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 5138ce4b20b9..0528d0ab9d05 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -39,8 +39,8 @@ # V2 test path constants V2_LOCAL_TEST_PATH = "test/v2" -V2_INSTANCE_PATH = "$HOME/test_v2" -V2_CONTAINER_PATH = "/test_v2" +V2_INSTANCE_PATH = "$HOME/test/v2" +V2_CONTAINER_PATH = "/test/v2" TEST_ID = str(uuid.uuid4()) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 1c1abd5b1675..272195b6e355 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -305,12 +305,12 @@ def _setup_container(connection, docker_image, container_name): if "vllm" in docker_image: connection.run( f"docker run --entrypoint=/bin/bash -e CUDA_HOME=/usr/local/cuda --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " - f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image}" + f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image}" ) else: connection.run( f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " - f"{docker_all_devices_arg} -v $HOME/test_v2:/test -v /dev/shm:/dev/shm {docker_image} bash" + f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) From ae024f1284a538b09874a0fa3bb5b73434bbcbc3 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Thu, 16 Oct 2025 18:02:47 -0700 Subject: [PATCH 15/33] move cleanup logic to infra dir --- infra/test_infra/ec2/setup.py | 11 +++++++++++ test/v2/ec2/vllm/test_ec2.py | 21 ++------------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 48874c8e6e3b..f7f6d00aea71 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -1,4 +1,5 @@ import os +import threading from invoke.context import Context from codebuild_environment import get_cloned_folder_path from infra.test_infra.test_infra_utils import create_logger @@ -83,6 +84,13 @@ def cleanup(self): if self.framework == "vllm": LOGGER.info("Cleaning up vLLM resources") + + cleanup_timer = threading.Timer( + 1000, + lambda: LOGGER.warning("Cleanup timed out, some resources might need manual cleanup") + ) + cleanup_timer.start() + try: from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup @@ -91,10 +99,13 @@ def cleanup(self): ec2_client = get_ec2_client(self.region) fsx = FsxSetup(self.region) cleanup_resources(ec2_client, self.resources, fsx) + cleanup_timer.cancel() 
LOGGER.info("vLLM cleanup completed successfully") except Exception as e: LOGGER.error(f"Error during vLLM cleanup: {e}") raise + finally: + cleanup_timer.cancel() else: LOGGER.info("Standard EC2 cleanup not yet implemented") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index 2f5b1895e854..aef003491df0 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -1,4 +1,3 @@ -import threading import boto3 import time, json from botocore.exceptions import ClientError @@ -7,13 +6,13 @@ from infra.test_infra.ec2.utils import ( get_account_id_from_image_uri, login_to_ecr_registry, - get_ec2_client, install_python_in_instance, + get_ec2_client, ) from infra.test_infra.test_infra_utils import create_logger from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup -from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources, TEST_ID +from infra.test_infra.ec2.vllm.setup_ec2 import TEST_ID from test.v2.ec2.efa.test_efa import ( _setup_multinode_efa_instances, EFA_SANITY_TEST_CMD, @@ -407,19 +406,3 @@ def test_vllm_on_ec2(resources, image_uri): except Exception as e: print(f"Test execution failed: {str(e)}") raise - - finally: - if ec2_cli and fsx: - cleanup_timer = threading.Timer( - 1000, lambda: print("Cleanup timed out, some resources might need manual cleanup") - ) - cleanup_timer.start() - - try: - cleanup_resources(ec2_cli, resources, fsx) - cleanup_timer.cancel() - print("Resources cleaned up successfully") - except Exception as e: - print(f"Cleanup failed: {str(e)}") - finally: - cleanup_timer.cancel() From bffcebdf8f2ac7105a36777d6c46e564c885d711 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Fri, 17 Oct 2025 14:16:41 -0700 Subject: [PATCH 16/33] add debugging, fix resource passing & formatting --- infra/test_infra/ec2/setup.py | 13 ++++--- infra/test_infra/ec2/utils.py | 1 + infra/test_infra/eks/setup.py | 2 +- test/v2/ec2/efa/test_efa.py | 26 ++++++++++--- test/v2/ec2/vllm/test_ec2.py | 73 +++++++++++++++++++++++++---------- 5 files changed, 83 insertions(+), 32 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index f7f6d00aea71..102563cd2519 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -55,6 +55,7 @@ def execute_command(self, cmd): } # Check if this is a vLLM test command + # TODO: check if there is a better way to handle this if self.framework == "vllm" and "test/v2/ec2/vllm/test_ec2.py" in cmd: LOGGER.info(f"Executing vLLM test via direct call: {cmd}") from test.v2.ec2.vllm.test_ec2 import test_vllm_on_ec2 @@ -70,7 +71,7 @@ def execute_command(self, cmd): LOGGER.info(f"Executing command from {repo_root} with EC2 environment: {cmd}") self.ctx.run(cmd, env=env) LOGGER.info(f"Command completed successfully: {cmd}") - + except Exception as e: raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e @@ -84,13 +85,15 @@ def cleanup(self): if self.framework == "vllm": LOGGER.info("Cleaning up vLLM resources") - + cleanup_timer = threading.Timer( - 1000, - lambda: LOGGER.warning("Cleanup timed out, some resources might need manual cleanup") + 1000, + lambda: LOGGER.warning( + "Cleanup timed out, some resources might need manual cleanup" + ), ) cleanup_timer.start() - + try: from infra.test_infra.ec2.vllm.setup_ec2 import cleanup_resources from infra.test_infra.ec2.vllm.fsx_utils import FsxSetup diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index 23dee77758f1..ec3204d46ec0 100644 --- a/infra/test_infra/ec2/utils.py 
+++ b/infra/test_infra/ec2/utils.py @@ -57,6 +57,7 @@ LOGGER = create_logger(__name__) + def filter_only_multi_gpu(instance_type_list): filtered_list = [ instance_type diff --git a/infra/test_infra/eks/setup.py b/infra/test_infra/eks/setup.py index 291af9265925..bfffe5df3e07 100644 --- a/infra/test_infra/eks/setup.py +++ b/infra/test_infra/eks/setup.py @@ -57,6 +57,6 @@ def execute_command(self, cmd): LOGGER.info(f"Executing command from {repo_root} with EKS environment: {cmd}") self.ctx.run(cmd, env=env) LOGGER.info(f"Command completed successfully: {cmd}") - + except Exception as e: raise RuntimeError(f"Failed to execute command: {cmd}\nError: {str(e)}") from e diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 272195b6e355..251d2ff88337 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -59,7 +59,7 @@ def get_efa_container_name(framework, test_scenario, arch_type, node_role=None): detected_framework = os.environ.get("FRAMEWORK") if not detected_framework: detected_framework = framework - + base_name = f"{detected_framework}-ec2-{test_scenario}-{arch_type}" return f"{base_name}-{node_role}" if node_role else base_name @@ -209,6 +209,16 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) + + # Verify files are visible inside container + print(f"Verifying files inside {master_container_name} container...") + run_cmd_on_container( + master_container_name, + master_connection, + "ls -la /test/v2/ec2/efa/", + hide=False, + ) + # Build all_reduce_perf binary using nccl-tests promise = run_cmd_on_container( master_container_name, @@ -222,7 +232,9 @@ def _setup_multinode_efa_instances( for idx, worker_connection in enumerate(efa_ec2_connections[1:]): # Determine worker container name if "vllm" in image: - worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") + worker_container_name = get_efa_container_name( + "vllm", "efa", arch_type, f"worker-{idx}" + ) else: worker_container_name = WORKER_CONTAINER_NAME @@ -254,10 +266,12 @@ def _setup_multinode_efa_instances( for idx, worker_connection in enumerate(efa_ec2_connections[1:]): # Determine worker container name if "vllm" in image: - worker_container_name = get_efa_container_name("vllm", "efa", arch_type, f"worker-{idx}") + worker_container_name = get_efa_container_name( + "vllm", "efa", arch_type, f"worker-{idx}" + ) else: worker_container_name = WORKER_CONTAINER_NAME - + # Configure worker node SSH server-side configurations, launch SSH daemon, and allow # password-less SSH access from master to worker nodes. _setup_worker_efa_ssh_config(worker_connection, master_pub_key, worker_container_name) @@ -347,7 +361,9 @@ def _setup_master_efa_ssh_config(connection, master_container_name): run_cmd_on_container(master_container_name, connection, "chmod -R 600 $HOME/.ssh/*") -def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, instance_type, region, master_container_name): +def _create_master_mpi_hosts_file( + efa_ec2_connections, worker_instance_ids, instance_type, region, master_container_name +): """ Create MPI Hosts file that contains private IP addresses of all hosts used in training job. 
:param efa_ec2_connections: List of Fabric Connection objects [master_connection, *worker_connections] diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index aef003491df0..9aa13761f9e3 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -292,7 +292,7 @@ def test_vllm_on_ec2(resources, image_uri): Args: resources: Dictionary containing instance information and FSx config image_uri: Docker image URI to test - + Environment Variables: ARCH_TYPE: Architecture type (x86_64 or arm64) AWS_REGION: AWS region @@ -300,6 +300,7 @@ def test_vllm_on_ec2(resources, image_uri): """ # Read arch_type from environment variable import os + arch_type = os.getenv("ARCH_TYPE", "x86_64") ec2_cli = None fsx = None @@ -312,28 +313,47 @@ def test_vllm_on_ec2(resources, image_uri): ec2_cli = get_ec2_client(DEFAULT_REGION) fsx = FsxSetup(DEFAULT_REGION) - for instance_id, key_filename in resources["instances_info"]: - try: - instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ - "Reservations" - ][0]["Instances"][0] - public_ip = instance_details.get("PublicIpAddress") - - if not public_ip: - raise Exception(f"No public IP found for instance {instance_id}") - - connection = Connection( - host=public_ip, - user="ec2-user", - connect_kwargs={"key_filename": key_filename}, + # Use existing connections from resources if available, otherwise create new ones + if "connections" in resources and resources["connections"]: + print("Using existing connections from setup phase") + # Use connections that were created during setup_test_artifacts() + ec2_connections = { + instance_id: conn + for (instance_id, _), conn in zip( + resources["instances_info"], resources["connections"] ) - - connection.run('echo "Connection test"', hide=True) - ec2_connections[instance_id] = connection - print(f"Successfully connected to instance {instance_id}") - + } + else: + print("Creating new connections to instances") + for instance_id, key_filename in resources["instances_info"]: + try: + instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ + "Reservations" + ][0]["Instances"][0] + public_ip = instance_details.get("PublicIpAddress") + + if not public_ip: + raise Exception(f"No public IP found for instance {instance_id}") + + connection = Connection( + host=public_ip, + user="ec2-user", + connect_kwargs={"key_filename": key_filename}, + ) + + ec2_connections[instance_id] = connection + + except Exception as e: + print(f"Failed to connect to instance {instance_id}: {str(e)}") + raise + + # Verify all connections are working + for instance_id, conn in ec2_connections.items(): + try: + conn.run('echo "Connection test"', hide=True) + print(f"Successfully verified connection to instance {instance_id}") except Exception as e: - print(f"Failed to connect to instance {instance_id}: {str(e)}") + print(f"Connection test failed for instance {instance_id}: {str(e)}") raise is_arm64 = "arm64" in image_uri @@ -350,6 +370,17 @@ def test_vllm_on_ec2(resources, image_uri): elif len(ec2_connections) >= 2: worker_conn = ec2_connections[instance_ids[1]] + # Verify test files exist before starting containers + print("\n=== Verifying test files on EC2 instances ===") + for conn_id, conn in ec2_connections.items(): + result = conn.run("ls -la $HOME/test/v2/ec2/efa/", warn=True) + if result.failed: + raise Exception( + f"Test files not found at $HOME/test/v2/ec2/efa/ on instance {conn_id}" + ) + print(f"Instance {conn_id} test files:") + print(result.stdout) + print("\n=== 
Starting EFA Tests ===") _setup_multinode_efa_instances( image_uri, From 1e04c399d6ba6eb0fdf00053a8b21a37d9cf8f86 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 19 Oct 2025 14:33:24 -0700 Subject: [PATCH 17/33] store connection params instead of objects --- infra/test_infra/ec2/vllm/setup_ec2.py | 18 +++++++++++++----- test/v2/ec2/vllm/test_ec2.py | 26 ++++++++++++++++---------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 0528d0ab9d05..c4958079ea9d 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -140,6 +140,7 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): ec2_connections = {} master_connection = None worker_connection = None + connection_params = [] for instance in instances: instance_id = instance["InstanceId"] @@ -162,6 +163,14 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): connection.run('echo "Connection test"', hide=True) ec2_connections[instance_id] = connection + # Store connection parameters for later recreation + connection_params.append({ + "instance_id": instance_id, + "host": public_ip, + "user": "ec2-user", + "key_filename": key_filename + }) + if not master_connection: master_connection = connection else: @@ -204,9 +213,8 @@ def delete_s3_artifact_copy(): finally: delete_s3_artifact_copy() - if worker_connection: - return [master_connection, worker_connection] - return [master_connection] + # Return connection parameters + return connection_params def launch_regular_instances_with_retry( @@ -342,13 +350,13 @@ def efa_ec2_instances( delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) raise Exception(f"Error allocating elastic IP: {str(e)}") - connections = setup_test_artifacts(ec2_client, instances, key_filename, region) + connection_params = setup_test_artifacts(ec2_client, instances, key_filename, region) return_val = { "instances": [ (instance_info["InstanceId"], key_filename) for instance_info in instances ], "elastic_ips": elastic_ip_allocation_ids, - "connections": connections, + "connection_params": connection_params, } print("Launched EFA Test instances") return return_val diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index 9aa13761f9e3..c66376cdcb06 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -313,16 +313,22 @@ def test_vllm_on_ec2(resources, image_uri): ec2_cli = get_ec2_client(DEFAULT_REGION) fsx = FsxSetup(DEFAULT_REGION) - # Use existing connections from resources if available, otherwise create new ones - if "connections" in resources and resources["connections"]: - print("Using existing connections from setup phase") - # Use connections that were created during setup_test_artifacts() - ec2_connections = { - instance_id: conn - for (instance_id, _), conn in zip( - resources["instances_info"], resources["connections"] - ) - } + # Recreate connections from stored parameters if available, otherwise create new ones + if "connection_params" in resources and resources["connection_params"]: + print("Recreating connections from stored parameters") + # Recreate fresh Connection objects from parameters stored during setup_test_artifacts() + for params in resources["connection_params"]: + try: + connection = Connection( + host=params["host"], + user=params["user"], + connect_kwargs={"key_filename": params["key_filename"]}, + ) + ec2_connections[params["instance_id"]] = connection + 
print(f"Recreated connection to instance {params['instance_id']}") + except Exception as e: + print(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") + raise else: print("Creating new connections to instances") for instance_id, key_filename in resources["instances_info"]: From 523c335e982b320fa5dd57ae267586a328c8b671 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 19 Oct 2025 19:30:15 -0700 Subject: [PATCH 18/33] fix connections reference --- infra/test_infra/ec2/vllm/setup_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index c4958079ea9d..88f2ea2bf907 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -601,7 +601,7 @@ def setup(image): instance_result = launch_ec2_instances(ec2_cli, image) resources["instances_info"] = instance_result["instances"] resources["elastic_ips"] = instance_result["elastic_ips"] - resources["connections"] = instance_result["connections"] + resources["connection_params"] = instance_result["connection_params"] print("Waiting 60 seconds for instances to initialize...") time.sleep(60) From 4e33e8347f86b796b49bd935d0517b14c7afc691 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Sun, 19 Oct 2025 20:37:56 -0700 Subject: [PATCH 19/33] add debug print --- infra/test_infra/ec2/vllm/setup_ec2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 88f2ea2bf907..4bc14849d8a5 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -196,6 +196,13 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") + + # Debug: check dir structure + print("=== DEBUG: Master instance dir structure ===") + print(f"Contents of {V2_INSTANCE_PATH}:") + master_connection.run(f"ls -la {V2_INSTANCE_PATH}/") + print("=== END DEBUG ===\n") + master_connection.run( f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" ) @@ -206,6 +213,13 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") + + # Debug: Check directory structure + print("=== DEBUG: Worker instance directory structure ===") + print(f"Contents of {V2_INSTANCE_PATH}:") + worker_connection.run(f"ls -la {V2_INSTANCE_PATH}/") + print("=== END DEBUG ===\n") + worker_connection.run( f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*" ) From e16cedfe8f35786e9a75a06b78a2e4e40cb537d1 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 11:51:40 -0700 Subject: [PATCH 20/33] update v2 test structure s3 upload & download paths --- infra/test_infra/ec2/vllm/setup_ec2.py | 22 +++++++++++----------- test/test_utils/__init__.py | 11 ++++++++--- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 4bc14849d8a5..73aa12ace19f 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -39,7 +39,7 @@ # V2 test path constants 
 V2_LOCAL_TEST_PATH = "test/v2"
-V2_INSTANCE_PATH = "$HOME/test/v2"
+INSTANCE_TEST_BASE_PATH = "$HOME/test"
 V2_CONTAINER_PATH = "/test/v2"
 
 TEST_ID = str(uuid.uuid4())
@@ -191,37 +191,37 @@ def delete_s3_artifact_copy():
     try:
         # Setup master instance
         if master_connection:
-            master_connection.run(f"rm -rf {V2_INSTANCE_PATH}")
+            master_connection.run(f"rm -rf {INSTANCE_TEST_BASE_PATH}")
             master_connection.run(
-                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
+                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
             )
             print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master")
 
             # Debug: check dir structure
             print("=== DEBUG: Master instance dir structure ===")
-            print(f"Contents of {V2_INSTANCE_PATH}:")
-            master_connection.run(f"ls -la {V2_INSTANCE_PATH}/")
+            print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:")
+            master_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/")
             print("=== END DEBUG ===\n")
 
             master_connection.run(
-                f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*"
+                f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*"
             )
 
         if worker_connection:
-            worker_connection.run(f"rm -rf {V2_INSTANCE_PATH}")
+            worker_connection.run(f"rm -rf {INSTANCE_TEST_BASE_PATH}")
             worker_connection.run(
-                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {V2_INSTANCE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
+                f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}"
             )
             print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker")
 
             # Debug: Check directory structure
             print("=== DEBUG: Worker instance directory structure ===")
-            print(f"Contents of {V2_INSTANCE_PATH}:")
-            worker_connection.run(f"ls -la {V2_INSTANCE_PATH}/")
+            print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:")
+            worker_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/")
             print("=== END DEBUG ===\n")
 
             worker_connection.run(
-                f"mkdir -p {V2_INSTANCE_PATH}/logs && chmod -R +x {V2_INSTANCE_PATH}/*"
+                f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*"
             )
 
     finally:
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index a908467230bc..54461488a75f 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -1309,9 +1309,14 @@ def upload_tests_to_s3(testname_datetime_suffix):
         raise EnvironmentError("Test is being run from wrong path")
     while os.path.basename(path) != "dlc_tests":
         path = os.path.dirname(path)
-    container_tests_path = os.path.join(path, "container_tests")
-
-    run(f"aws s3 cp --recursive {container_tests_path}/ {s3_test_location}/")
+
+    # If the new test structure is enabled, upload only the v2 directory
+    if is_new_test_structure_enabled():
+        v2_path = os.path.join(os.path.dirname(path), "v2")
+        run(f"aws s3 cp --recursive {v2_path}/ {s3_test_location}/v2/")
+    else:
+        container_tests_path = os.path.join(path, "container_tests")
+        run(f"aws s3 cp --recursive {container_tests_path}/ {s3_test_location}/")
 
     return s3_test_location
 
From c903d19a9dc6235189815c9b1eb2a0bebb79fc61 Mon Sep 17 00:00:00 2001
From: Jinyan Li
Date: Mon, 20 Oct 2025 12:47:14 -0700
Subject: [PATCH 21/33] update training log path

---
test/v2/ec2/efa/testEFA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/v2/ec2/efa/testEFA b/test/v2/ec2/efa/testEFA index 4b676249d816..75972864c2cd 100644 --- a/test/v2/ec2/efa/testEFA +++ b/test/v2/ec2/efa/testEFA @@ -20,7 +20,7 @@ NODES=$(($GPU_COUNT * $NUM_HOSTS)) PRETTY_NAME=$(cat /etc/os-release | grep PRETTY_NAME) -TRAINING_LOG="/test/logs/testEFA.log" +TRAINING_LOG="/test/v2/logs/testEFA.log" USE_DEVICE_RDMA_ARG="" From fef320df98b23e110b7622b226a1aa01001b97dc Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 14:27:20 -0700 Subject: [PATCH 22/33] add logging for container names --- test/v2/ec2/efa/test_efa.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 251d2ff88337..cb1a8bd2be1f 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -21,6 +21,9 @@ filter_efa_instance_type, filter_efa_only_p4_instance_type, ) +from infra.test_infra.test_infra_utils import create_logger + +LOGGER = create_logger(__name__) BUILD_ALL_REDUCE_PERF_CMD = os.path.join( CONTAINER_TESTS_PREFIX_V2, "efa", "build_all_reduce_perf.sh" @@ -206,6 +209,8 @@ def _setup_multinode_efa_instances( else: master_container_name = MASTER_CONTAINER_NAME + LOGGER.info(f"Master container name: {master_container_name}") + build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) @@ -238,6 +243,8 @@ def _setup_multinode_efa_instances( else: worker_container_name = WORKER_CONTAINER_NAME + LOGGER.info(f"Worker container name: {worker_container_name}") + # Run container _setup_container(worker_connection, image, worker_container_name) # Build all_reduce_perf binary using nccl-tests @@ -326,6 +333,8 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) + + LOGGER.info(f"Container {container_name} started successfully") def _setup_master_efa_ssh_config(connection, master_container_name): From 3b893c4901414dab1a3dcd2496915e2e2e5bedd9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 14:56:18 -0700 Subject: [PATCH 23/33] clean up debug logs and make prints logs --- infra/test_infra/ec2/vllm/fsx_utils.py | 16 +++--- infra/test_infra/ec2/vllm/setup_ec2.py | 68 ++++++++++------------- test/v2/ec2/efa/test_efa.py | 18 +++---- test/v2/ec2/vllm/test_ec2.py | 75 +++++++++++--------------- 4 files changed, 77 insertions(+), 100 deletions(-) diff --git a/infra/test_infra/ec2/vllm/fsx_utils.py b/infra/test_infra/ec2/vllm/fsx_utils.py index ce27840fff90..a7b1b0dc4446 100644 --- a/infra/test_infra/ec2/vllm/fsx_utils.py +++ b/infra/test_infra/ec2/vllm/fsx_utils.py @@ -72,7 +72,7 @@ def delete_fsx_filesystem(self, fsx_id: str): f" --output text" ).stdout.strip() - print(f"Deleted FSx filesystem: {fsx_id}") + LOGGER.info(f"Deleted FSx filesystem: {fsx_id}") except Exception as e: LOGGER.error(f"Failed to create FSx filesystem: {e}") @@ -85,7 +85,7 @@ def wait_for_filesystem(self, filesystem_id: str): : return: dictionary containing filesystem details (filesystem_id, dns_name, mount_name) : raises: Exception if filesystem enters FAILED, DELETING, or DELETED state """ - print(f"Waiting for FSx filesystem {filesystem_id} to be available...") + LOGGER.info(f"Waiting for FSx filesystem {filesystem_id} to be available...") while 
True: status = run( f"aws fsx describe-file-systems --file-system-id {filesystem_id} " @@ -97,7 +97,7 @@ def wait_for_filesystem(self, filesystem_id: str): elif status in ["FAILED", "DELETING", "DELETED"]: raise Exception(f"FSx filesystem entered {status} state") - print(f"FSx status: {status}, waiting...") + LOGGER.info(f"FSx status: {status}, waiting...") time.sleep(30) # get fs DNS and mount name @@ -130,12 +130,12 @@ def create_fsx_security_group(self, ec2_cli, vpc_id, group_name, description): VpcId=vpc_id, ) sg_id = response["GroupId"] - print(f"Created security group: {sg_id}") + LOGGER.info(f"Created security group: {sg_id}") return sg_id except ClientError as e: - print(f"An error occurred: {e}") + LOGGER.info(f"An error occurred: {e}") return None def add_ingress_rules_sg(self, ec2_cli, sg_id, instance_ids): @@ -173,12 +173,12 @@ def add_ingress_rules_sg(self, ec2_cli, sg_id, instance_ids): } ], ) - print( + LOGGER.info( f"Added inbound rules to FSx security group {sg_id} for instance security groups: {instance_sg_ids}" ) except Exception as e: - print(f"Error adding ingress rules: {str(e)}") + LOGGER.info(f"Error adding ingress rules: {str(e)}") raise def delete_security_group(self, ec2_cli, group_id: str): @@ -195,7 +195,7 @@ def delete_security_group(self, ec2_cli, group_id: str): GroupId=group_id, ) sg_id = response["GroupId"] - print(f"Deleted security group: {sg_id}") + LOGGER.info(f"Deleted security group: {sg_id}") except Exception as e: LOGGER.error(f"Failed to delete security group: {e}") diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 73aa12ace19f..2f08600b4a90 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -176,10 +176,10 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): else: worker_connection = connection - print(f"Successfully connected to instance {instance_id}") + LOGGER.info(f"Successfully connected to instance {instance_id}") except Exception as e: - print(f"Failed to connect to instance {instance_id}: {str(e)}") + LOGGER.error(f"Failed to connect to instance {instance_id}: {str(e)}") raise artifact_folder = f"vllm-{TEST_ID}-folder" @@ -195,13 +195,7 @@ def delete_s3_artifact_copy(): master_connection.run( f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) - print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") - - # Debug: check dir structure - print("=== DEBUG: Master instance dir structure ===") - print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:") - master_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/") - print("=== END DEBUG ===\n") + LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") master_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" @@ -212,13 +206,7 @@ def delete_s3_artifact_copy(): worker_connection.run( f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) - print(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") - - # Debug: Check directory structure - print("=== DEBUG: Worker instance directory structure ===") - print(f"Contents of {INSTANCE_TEST_BASE_PATH}/v2/:") - worker_connection.run(f"ls -la {INSTANCE_TEST_BASE_PATH}/v2/") - print("=== END DEBUG ===\n") 
+ LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") worker_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" @@ -277,7 +265,7 @@ def efa_ec2_instances( try: ec2_key_name = f"{ec2_key_name}-{TEST_ID}" - print(f"Creating instance: CI-CD {ec2_key_name}") + LOGGER.info(f"Creating instance: CI-CD {ec2_key_name}") key_filename = test_utils.generate_ssh_keypair(ec2_client, ec2_key_name) volume_name = "/dev/sda1" if ec2_instance_ami in test_utils.UL_AMI_LIST else "/dev/xvda" @@ -329,7 +317,7 @@ def efa_ec2_instances( check_system_state( master_instance_id, system_status="ok", instance_status="ok", region=region ) - print(f"Master instance {master_instance_id} is ready") + LOGGER.info(f"Master instance {master_instance_id} is ready") create_name_tags_for_instance(master_instance_id, f"{instance_name_prefix}_master", region) if is_efa: for i in range(1, len(instances)): @@ -341,13 +329,13 @@ def efa_ec2_instances( check_system_state( worker_instance_id, system_status="ok", instance_status="ok", region=region ) - print(f"Worker instance {worker_instance_id} is ready") + LOGGER.info(f"Worker instance {worker_instance_id} is ready") num_efa_interfaces = get_num_efa_interfaces_for_instance_type( ec2_instance_type, region=region ) - print(num_efa_interfaces) + LOGGER.info(num_efa_interfaces) if num_efa_interfaces > 1: for instance in instances: @@ -372,17 +360,17 @@ def efa_ec2_instances( "elastic_ips": elastic_ip_allocation_ids, "connection_params": connection_params, } - print("Launched EFA Test instances") + LOGGER.info("Launched EFA Test instances") return return_val except Exception as e: - print(f"Error in efa_ec2_instances: {str(e)}") + LOGGER.error(f"Error in efa_ec2_instances: {str(e)}") # Clean up elastic IPs if elastic_ip_allocation_ids: try: delete_elastic_ips(elastic_ip_allocation_ids, ec2_client) except Exception as cleanup_error: - print(f"Error cleaning up elastic IPs: {str(cleanup_error)}") + LOGGER.error(f"Error cleaning up elastic IPs: {str(cleanup_error)}") # Clean up instances if instances: @@ -393,7 +381,7 @@ def efa_ec2_instances( waiter = ec2_client.get_waiter("instance_terminated") waiter.wait(InstanceIds=instance_ids) except Exception as cleanup_error: - print(f"Error terminating instances: {str(cleanup_error)}") + LOGGER.error(f"Error terminating instances: {str(cleanup_error)}") # Clean up key pair if key_filename: @@ -403,7 +391,7 @@ def efa_ec2_instances( if os.path.exists(f"{key_filename}.pub"): os.remove(f"{key_filename}.pub") except Exception as cleanup_error: - print(f"Error cleaning up key files: {str(cleanup_error)}") + LOGGER.error(f"Error cleaning up key files: {str(cleanup_error)}") raise @@ -437,13 +425,13 @@ def wait_for_instances(instance_ids): waiter.wait(InstanceIds=instance_ids, WaiterConfig={"Delay": 60, "MaxAttempts": 100}) return True except WaiterError as e: - print(f"Warning: Instance termination waiter timed out: {str(e)}") + LOGGER.error(f"Warning: Instance termination waiter timed out: {str(e)}") return False if resources.get("elastic_ips"): try: delete_elastic_ips(resources["elastic_ips"], ec2_cli) - print(f"Deleted elastic IPs: {resources['elastic_ips']}") + LOGGER.error(f"Deleted elastic IPs: {resources['elastic_ips']}") except Exception as e: cleanup_errors.append(f"Failed to cleanup Elastic IPs: {str(e)}") @@ -451,7 +439,7 @@ def wait_for_instances(instance_ids): try: instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] 
ec2_cli.terminate_instances(InstanceIds=instance_ids) - print(f"Terminating instances: {instance_ids}") + LOGGER.info(f"Terminating instances: {instance_ids}") if not wait_for_instances(instance_ids): cleanup_errors.append("Instances did not terminate within expected timeframe") @@ -472,7 +460,7 @@ def wait_for_instances(instance_ids): if resources.get("fsx_config"): try: fsx.delete_fsx_filesystem(resources["fsx_config"]["filesystem_id"]) - print(f"Deleted FSx filesystem: {resources['fsx_config']['filesystem_id']}") + LOGGER.info(f"Deleted FSx filesystem: {resources['fsx_config']['filesystem_id']}") except Exception as e: cleanup_errors.append(f"Failed to delete FSx filesystem: {str(e)}") @@ -483,7 +471,7 @@ def wait_for_instances(instance_ids): for attempt in range(max_attempts): try: ec2_cli.delete_security_group(GroupId=resources["sg_fsx"]) - print(f"Deleted security group: {resources['sg_fsx']}") + LOGGER.info(f"Deleted security group: {resources['sg_fsx']}") break except Exception as e: if attempt == max_attempts - 1: @@ -491,7 +479,7 @@ def wait_for_instances(instance_ids): f"Failed to delete security group after {max_attempts} attempts: {str(e)}" ) else: - print(f"Retry {attempt + 1}/{max_attempts} to delete security group") + LOGGER.info(f"Retry {attempt + 1}/{max_attempts} to delete security group") time.sleep(30) if cleanup_errors: @@ -515,7 +503,7 @@ def launch_ec2_instances(ec2_cli, image): availability_zone_options=az_options, is_arm64=is_arm64, ) - print(f"Launched instances: {instances_info}") + LOGGER.info(f"Launched instances: {instances_info}") return instances_info @@ -541,7 +529,7 @@ def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info) fsx_name, "Security group for FSx Lustre VLLM EC2 Tests", ) - print(f"Created FSx security group: {sg_fsx}") + LOGGER.info(f"Created FSx security group: {sg_fsx}") # Get instance IDs from instances_info instance_ids = [instance_id for instance_id, _ in instances_info] @@ -552,7 +540,7 @@ def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info) return sg_fsx except Exception as e: - print(f"Error configuring security groups: {str(e)}") + LOGGER.error(f"Error configuring security groups: {str(e)}") raise @@ -603,7 +591,7 @@ def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_ def setup(image): """Main setup function for VLLM on EC2 with FSx""" - print("Testing vllm on ec2........") + LOGGER.info("Testing vllm on ec2........") fsx = FsxSetup(DEFAULT_REGION) ec2_cli = get_ec2_client(DEFAULT_REGION) resources = {"instances_info": None, "fsx_config": None, "sg_fsx": None} @@ -616,7 +604,7 @@ def setup(image): resources["instances_info"] = instance_result["instances"] resources["elastic_ips"] = instance_result["elastic_ips"] resources["connection_params"] = instance_result["connection_params"] - print("Waiting 60 seconds for instances to initialize...") + LOGGER.info("Waiting 60 seconds for instances to initialize...") time.sleep(60) instance_ids = [instance_id for instance_id, _ in resources["instances_info"]] @@ -632,7 +620,7 @@ def setup(image): "SCRATCH_2", {"Name": f"fsx-lustre-vllm-ec2-test-{instance_ids[0]}-{TEST_ID}"}, ) - print("Created FSx filesystem") + LOGGER.info("Created FSx filesystem") master_instance_id, master_key_filename = resources["instances_info"][0] setup_instance( @@ -642,7 +630,7 @@ def setup(image): resources["fsx_config"]["dns_name"], resources["fsx_config"]["mount_name"], ) - print(f"Setup completed for master instance 
{master_instance_id}") + LOGGER.info(f"Setup completed for master instance {master_instance_id}") if len(resources["instances_info"]) > 1: worker_instance_id, worker_key_filename = resources["instances_info"][1] @@ -653,12 +641,12 @@ def setup(image): resources["fsx_config"]["dns_name"], resources["fsx_config"]["mount_name"], ) - print(f"FSx mounted on worker instance {worker_instance_id}") + LOGGER.info(f"FSx mounted on worker instance {worker_instance_id}") return resources except Exception as e: - print(f"Error during setup: {str(e)}") + LOGGER.error(f"Error during setup: {str(e)}") cleanup_resources(ec2_cli, resources, fsx) raise diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index cb1a8bd2be1f..8c26e32e6271 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -214,15 +214,15 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) - - # Verify files are visible inside container - print(f"Verifying files inside {master_container_name} container...") - run_cmd_on_container( - master_container_name, - master_connection, - "ls -la /test/v2/ec2/efa/", - hide=False, - ) + + # Uncomment to verify container file structure in case of path issues + # LOGGER.info(f"Verifying files inside {master_container_name} container") + # run_cmd_on_container( + # master_container_name, + # master_connection, + # "ls -la /test/v2/ec2/efa/", + # hide=False, + # ) # Build all_reduce_perf binary using nccl-tests promise = run_cmd_on_container( diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index c66376cdcb06..be6982188cdf 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -92,10 +92,10 @@ def wait_for_container_ready(connection, container_name, timeout: int = 1000) -> """ result = connection.run(curl_cmd, hide=False) if result.ok: - print("Model endpoint is responding") - print("\n=== Complete vLLM Server Log ===") + LOGGER.info("Model endpoint is responding") + LOGGER.info("\n=== Complete vLLM Server Log ===") connection.run(f"docker exec {container_name} cat vllm.log", hide=False) - print("=== End of Log ===\n") + LOGGER.info("=== End of Log ===\n") model_ready = True return True except Exception: @@ -106,7 +106,7 @@ def wait_for_container_ready(connection, container_name, timeout: int = 1000) -> def setup_docker_image(conn, image_uri): account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(conn, account_id, DEFAULT_REGION) - print(f"Pulling image: {image_uri}") + LOGGER.info(f"Pulling image: {image_uri}") conn.run(f"docker pull {image_uri}", hide="out") @@ -135,7 +135,7 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_ worker_ip = worker_connection.run("hostname -i").stdout.strip() container_name = "ray_head-" + TEST_ID - print("Starting head node...") + LOGGER.info("Starting head node...") head_connection.run( f"./head_node_setup.sh {image_uri} {hf_token} {head_ip} {container_name}" ) @@ -154,11 +154,11 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_ f"docker exec -i {container_name} /bin/bash -c '{serve_command} > vllm.log 2>&1 &'" ) - print("Waiting for model to be ready, approx estimated time to complete is 15 mins...") + LOGGER.info("Waiting for model to be ready, approx estimated time to complete is 15 mins...") if not wait_for_container_ready(head_connection, container_name, timeout=2000): raise 
Exception("Container failed to become ready within timeout period") - print("Running benchmark...") + LOGGER.info("Running benchmark...") benchmark_cmd = "source vllm_env/bin/activate &&" + create_benchmark_command() benchmark_result = head_connection.run(benchmark_cmd, timeout=7200) @@ -182,19 +182,19 @@ def verify_gpu_setup(connection): # Check nvidia-smi result = connection.run("nvidia-smi", hide=True) if result.failed: - print("nvidia-smi check failed") + LOGGER.info("nvidia-smi check failed") return False # Check CUDA availability cuda_check = connection.run("nvidia-smi -L", hide=True) if cuda_check.failed or "GPU" not in cuda_check.stdout: - print("No GPUs found") + LOGGER.info("No GPUs found") return False return True except Exception as e: - print(f"GPU verification failed: {str(e)}") + LOGGER.info(f"GPU verification failed: {str(e)}") return False @@ -206,7 +206,7 @@ def cleanup_containers(connection): connection: Fabric connection object """ try: - print("Cleaning up containers and images...") + LOGGER.info("Cleaning up containers and images...") commands = [ "docker ps -aq | xargs -r docker stop", "docker ps -aq | xargs -r docker rm", @@ -214,7 +214,7 @@ def cleanup_containers(connection): for cmd in commands: connection.run(cmd, hide=True, warn=True) except Exception as e: - print(f"Cleanup warning: {str(e)}") + LOGGER.error(f"Cleanup warning: {str(e)}") def run_multi_node_test(head_conn, worker_conn, image_uri): @@ -227,7 +227,7 @@ def run_multi_node_test(head_conn, worker_conn, image_uri): image_uri: ECR image URI """ - print("\n=== Starting Multi-Node Test ===") + LOGGER.info("\n=== Starting Multi-Node Test ===") verification_tasks = [(head_conn, "head"), (worker_conn, "worker")] for conn, node_type in verification_tasks: if not verify_gpu_setup(conn): @@ -235,7 +235,7 @@ def run_multi_node_test(head_conn, worker_conn, image_uri): result = test_vllm_benchmark_on_multi_node(head_conn, worker_conn, image_uri) if result.ok: - print("Multi-node test completed successfully") + LOGGER.info("Multi-node test completed successfully") return True return False @@ -275,11 +275,11 @@ def run_single_node_test(head_conn, image_uri): ) except Exception as e: - print(f"Test execution failed: {str(e)}") + LOGGER.error(f"Test execution failed: {str(e)}") raise if result.ok: - print("Single-node test completed successfully") + LOGGER.info("Single-node test completed successfully") return True @@ -315,7 +315,7 @@ def test_vllm_on_ec2(resources, image_uri): # Recreate connections from stored parameters if available, otherwise create new ones if "connection_params" in resources and resources["connection_params"]: - print("Recreating connections from stored parameters") + LOGGER.info("Recreating connections from stored parameters") # Recreate fresh Connection objects from parameters stored during setup_test_artifacts() for params in resources["connection_params"]: try: @@ -325,12 +325,12 @@ def test_vllm_on_ec2(resources, image_uri): connect_kwargs={"key_filename": params["key_filename"]}, ) ec2_connections[params["instance_id"]] = connection - print(f"Recreated connection to instance {params['instance_id']}") + LOGGER.info(f"Recreated connection to instance {params['instance_id']}") except Exception as e: - print(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") + LOGGER.error(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") raise else: - print("Creating new connections to instances") + LOGGER.info("Creating new connections to 
instances") for instance_id, key_filename in resources["instances_info"]: try: instance_details = ec2_cli.describe_instances(InstanceIds=[instance_id])[ @@ -350,16 +350,16 @@ def test_vllm_on_ec2(resources, image_uri): ec2_connections[instance_id] = connection except Exception as e: - print(f"Failed to connect to instance {instance_id}: {str(e)}") + LOGGER.error(f"Failed to connect to instance {instance_id}: {str(e)}") raise # Verify all connections are working for instance_id, conn in ec2_connections.items(): try: conn.run('echo "Connection test"', hide=True) - print(f"Successfully verified connection to instance {instance_id}") + LOGGER.info(f"Successfully verified connection to instance {instance_id}") except Exception as e: - print(f"Connection test failed for instance {instance_id}: {str(e)}") + LOGGER.error(f"Connection test failed for instance {instance_id}: {str(e)}") raise is_arm64 = "arm64" in image_uri @@ -367,27 +367,16 @@ def test_vllm_on_ec2(resources, image_uri): head_conn = ec2_connections[instance_ids[0]] if is_arm64: - print("\n=== Starting ARM64 Single Node Test ===") + LOGGER.info("\n=== Starting ARM64 Single Node Test ===") test_results["single_node"] = run_single_node_test(head_conn, image_uri) - print( + LOGGER.info( f"ARM64 Single node test: {'Passed' if test_results['single_node'] else 'Failed'}" ) elif len(ec2_connections) >= 2: worker_conn = ec2_connections[instance_ids[1]] - # Verify test files exist before starting containers - print("\n=== Verifying test files on EC2 instances ===") - for conn_id, conn in ec2_connections.items(): - result = conn.run("ls -la $HOME/test/v2/ec2/efa/", warn=True) - if result.failed: - raise Exception( - f"Test files not found at $HOME/test/v2/ec2/efa/ on instance {conn_id}" - ) - print(f"Instance {conn_id} test files:") - print(result.stdout) - - print("\n=== Starting EFA Tests ===") + LOGGER.info("\n=== Starting EFA Tests ===") _setup_multinode_efa_instances( image_uri, resources["instances_info"][:2], @@ -417,22 +406,22 @@ def test_vllm_on_ec2(resources, image_uri): for conn in [head_conn, worker_conn]: cleanup_containers(conn) - print("EFA tests completed successfully") + LOGGER.info("EFA tests completed successfully") # Run multi-node test test_results["multi_node"] = run_multi_node_test(head_conn, worker_conn, image_uri) else: - print("\nSkipping multi-node test: insufficient instances") + LOGGER.info("\nSkipping multi-node test: insufficient instances") - print("\n=== Test Summary ===") + LOGGER.info("\n=== Test Summary ===") for test_name, result in test_results.items(): if result is not None: - print( + LOGGER.info( f"{test_name.replace('_', ' ').title()} test: {'Passed' if result else 'Failed'}" ) else: - print(f"{test_name.replace('_', ' ').title()} test: Not Run") + LOGGER.info(f"{test_name.replace('_', ' ').title()} test: Not Run") if is_arm64: if not test_results["single_node"]: @@ -441,5 +430,5 @@ def test_vllm_on_ec2(resources, image_uri): raise Exception("All tests failed") except Exception as e: - print(f"Test execution failed: {str(e)}") + LOGGER.error(f"Test execution failed: {str(e)}") raise From fa34264fd901d8c4a633bf559833c92c76815fc9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 14:58:13 -0700 Subject: [PATCH 24/33] formatting --- infra/test_infra/ec2/vllm/setup_ec2.py | 18 ++++++++++-------- test/test_utils/__init__.py | 2 +- test/v2/ec2/efa/test_efa.py | 4 ++-- test/v2/ec2/vllm/test_ec2.py | 8 ++++++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git 
a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 2f08600b4a90..57d2227fd788 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -164,12 +164,14 @@ def setup_test_artifacts(ec2_client, instances, key_filename, region): ec2_connections[instance_id] = connection # Store connection parameters for later recreation - connection_params.append({ - "instance_id": instance_id, - "host": public_ip, - "user": "ec2-user", - "key_filename": key_filename - }) + connection_params.append( + { + "instance_id": instance_id, + "host": public_ip, + "user": "ec2-user", + "key_filename": key_filename, + } + ) if not master_connection: master_connection = connection @@ -196,7 +198,7 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for master") - + master_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" ) @@ -207,7 +209,7 @@ def delete_s3_artifact_copy(): f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {INSTANCE_TEST_BASE_PATH} --region {test_utils.TEST_TRANSFER_S3_BUCKET_REGION}" ) LOGGER.info(f"Successfully copying {test_utils.TEST_TRANSFER_S3_BUCKET} for worker") - + worker_connection.run( f"mkdir -p {INSTANCE_TEST_BASE_PATH}/v2/logs && chmod -R +x {INSTANCE_TEST_BASE_PATH}/v2/*" ) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 54461488a75f..198050e8823a 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -1309,7 +1309,7 @@ def upload_tests_to_s3(testname_datetime_suffix): raise EnvironmentError("Test is being run from wrong path") while os.path.basename(path) != "dlc_tests": path = os.path.dirname(path) - + # If if new test structure is enabled, upload only v2 directory for new test structure if is_new_test_structure_enabled(): v2_path = os.path.join(os.path.dirname(path), "v2") diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 8c26e32e6271..062259093b94 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -214,7 +214,7 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) - + # Uncomment to verify container file structure in case of path issues # LOGGER.info(f"Verifying files inside {master_container_name} container") # run_cmd_on_container( @@ -333,7 +333,7 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) - + LOGGER.info(f"Container {container_name} started successfully") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index be6982188cdf..04de0d70028b 100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -154,7 +154,9 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_ f"docker exec -i {container_name} /bin/bash -c '{serve_command} > vllm.log 2>&1 &'" ) - LOGGER.info("Waiting for model to be ready, approx estimated time to complete is 15 mins...") + LOGGER.info( + "Waiting for model to be ready, approx estimated 
time to complete is 15 mins..." + ) if not wait_for_container_ready(head_connection, container_name, timeout=2000): raise Exception("Container failed to become ready within timeout period") @@ -327,7 +329,9 @@ def test_vllm_on_ec2(resources, image_uri): ec2_connections[params["instance_id"]] = connection LOGGER.info(f"Recreated connection to instance {params['instance_id']}") except Exception as e: - LOGGER.error(f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}") + LOGGER.error( + f"Failed to recreate connection to instance {params['instance_id']}: {str(e)}" + ) raise else: LOGGER.info("Creating new connections to instances") From bbcc859b37ab4197606c7a036020215deae97c5f Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 15:03:43 -0700 Subject: [PATCH 25/33] Clean up unused imports --- infra/test_infra/ec2/utils.py | 1 - infra/test_infra/ec2/vllm/fsx_utils.py | 3 +-- .../test_infra/validators/base_platform_validator.py | 3 +-- test/v2/ec2/efa/test_efa.py | 12 +----------- test/v2/ec2/vllm/test_ec2.py | 1 - 5 files changed, 3 insertions(+), 17 deletions(-) diff --git a/infra/test_infra/ec2/utils.py b/infra/test_infra/ec2/utils.py index ec3204d46ec0..30c8430b7ba5 100644 --- a/infra/test_infra/ec2/utils.py +++ b/infra/test_infra/ec2/utils.py @@ -1,7 +1,6 @@ import os import time import re -import sys import uuid import copy diff --git a/infra/test_infra/ec2/vllm/fsx_utils.py b/infra/test_infra/ec2/vllm/fsx_utils.py index a7b1b0dc4446..eed3635550b1 100644 --- a/infra/test_infra/ec2/vllm/fsx_utils.py +++ b/infra/test_infra/ec2/vllm/fsx_utils.py @@ -1,7 +1,6 @@ import time from invoke import run -from typing import Dict, List, Any -import boto3 +from typing import Dict, List from botocore.exceptions import ClientError from infra.test_infra.test_infra_utils import create_logger diff --git a/infra/test_infra/validators/base_platform_validator.py b/infra/test_infra/validators/base_platform_validator.py index 3f78e00f57d3..3353c721ef22 100644 --- a/infra/test_infra/validators/base_platform_validator.py +++ b/infra/test_infra/validators/base_platform_validator.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import List, Dict, Type -from dataclasses import dataclass +from typing import List, Dict class BasePlatformValidator(ABC): diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 062259093b94..43ef77c563d8 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -214,16 +214,6 @@ def _setup_multinode_efa_instances( build_all_reduce_perf_promises = [] # Run container _setup_container(master_connection, image, master_container_name) - - # Uncomment to verify container file structure in case of path issues - # LOGGER.info(f"Verifying files inside {master_container_name} container") - # run_cmd_on_container( - # master_container_name, - # master_connection, - # "ls -la /test/v2/ec2/efa/", - # hide=False, - # ) - # Build all_reduce_perf binary using nccl-tests promise = run_cmd_on_container( master_container_name, @@ -333,7 +323,7 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) - + LOGGER.info(f"Container {container_name} started successfully") diff --git a/test/v2/ec2/vllm/test_ec2.py b/test/v2/ec2/vllm/test_ec2.py index 04de0d70028b..54e2cfb12005 
100644 --- a/test/v2/ec2/vllm/test_ec2.py +++ b/test/v2/ec2/vllm/test_ec2.py @@ -16,7 +16,6 @@ from test.v2.ec2.efa.test_efa import ( _setup_multinode_efa_instances, EFA_SANITY_TEST_CMD, - MASTER_CONTAINER_NAME, HOSTS_FILE_LOCATION, EFA_INTEGRATION_TEST_CMD, DEFAULT_EFA_TIMEOUT, From 22b31bfe227318faf8ab76bb0038af0614334d37 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 15:16:15 -0700 Subject: [PATCH 26/33] remove unused code for vllm --- test/v2/ec2/efa/test_efa.py | 112 ------------------------------------ 1 file changed, 112 deletions(-) diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 43ef77c563d8..88a1397b19f3 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -1,21 +1,12 @@ import os - -import pytest - import test.test_utils.ec2 as ec2_utils from test.test_utils import ( CONTAINER_TESTS_PREFIX_V2, get_account_id_from_image_uri, get_region_from_image_uri, - is_pr_context, - is_efa_dedicated, - are_heavy_instance_ec2_tests_enabled, login_to_ecr_registry, run_cmd_on_container, ) -from packaging.version import Version -from packaging.specifiers import SpecifierSet - from infra.test_infra.ec2.utils import ( get_efa_ec2_instance_type, filter_efa_instance_type, @@ -78,109 +69,6 @@ def get_efa_container_name(framework, test_scenario, arch_type, node_role=None): ) -# TODO: decide on whether to keep this commented out or left out until actual implementation of each framework -# def test_pytorch_efa( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only -# ): -# """ -# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA -# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version -# installed in the DLC image. The test also builds nccl-tests to create the all_reduce_perf -# binary necessary for multinode tests, on each node. -# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances -# on pipelines. -# :param pytorch_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# number_of_nodes = 2 -# _setup_multinode_efa_instances( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) - -# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" - -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"{EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) - - -# def test_efa_tensorflow( -# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, gpu_only -# ): -# """ -# Run EFA Sanity tests on DLC, and then run NCCL Message Transfer and All Reduce tests using EFA -# on multiple nodes using DLC images. The test scripts are agnostic to the framework and version -# installed in the DLC image. 
The test also builds nccl-tests to create the all_reduce_perf -# binary necessary for multinode tests, on each node. -# Note: This test must be explicitly enabled on CI, and will only run on EFA-capable instances -# on pipelines. -# :param tensorflow_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# number_of_nodes = 2 -# _setup_multinode_efa_instances( -# tensorflow_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) - -# # pass IPv6 flag if enabled -# ipv6_arg = "True" if ENABLE_IPV6_TESTING else "" - -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"export CUDA_HOME='/usr/local/cuda'; {EFA_INTEGRATION_TEST_CMD} {HOSTS_FILE_LOCATION} {number_of_nodes} {ipv6_arg}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) - - -# def test_pytorch_efa_healthcheck( -# pytorch_training, -# efa_ec2_instances, -# efa_ec2_connections, -# ec2_instance_type, -# region, -# gpu_only, -# ): -# """ -# Run EFA Health Check tests on DLC. -# :param pytorch_training: str PyTorch Training DLC image URI -# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances -# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances -# :param ec2_instance_type: str Instance Type being tested -# :param region: str Region in which EFA-enabled instances are launched -# :param gpu_only: pytest fixture to limit test only to GPU DLCs -# """ -# _setup_multinode_efa_instances( -# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region -# ) -# master_connection = efa_ec2_connections[0] -# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False) -# run_cmd_on_container( -# MASTER_CONTAINER_NAME, -# master_connection, -# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}", -# hide=False, -# timeout=DEFAULT_EFA_TIMEOUT, -# ) - - def _setup_multinode_efa_instances( image, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region, arch_type=None ): From 6c0103258ef57e4fbe001063f1c83ca689967cf6 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 15:17:17 -0700 Subject: [PATCH 27/33] rerun ec2 and eks using new path --- vllm/buildspec.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index e79fe18ff8f8..a007d441998e 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - # - platform: eks - # params: - # cluster: dlc-vllm - # namespace: vllm - # run: - # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + - platform: eks + params: + cluster: dlc-vllm + namespace: vllm + run: + - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From c652b7669c5cae410df013d727aef548443e34c5 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 17:12:03 -0700 Subject: [PATCH 28/33] change logger level --- infra/test_infra/ec2/vllm/setup_ec2.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 57d2227fd788..29382bb7bba4 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -433,7 +433,7 @@ def wait_for_instances(instance_ids): if resources.get("elastic_ips"): try: delete_elastic_ips(resources["elastic_ips"], ec2_cli) - LOGGER.error(f"Deleted elastic IPs: {resources['elastic_ips']}") + LOGGER.info(f"Deleted elastic IPs: {resources['elastic_ips']}") except Exception as e: cleanup_errors.append(f"Failed to cleanup Elastic IPs: {str(e)}") From 2680d2f5ad69ba5d45be5f0afc09c6d1644cd94d Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Mon, 20 Oct 2025 17:20:44 -0700 Subject: [PATCH 29/33] fix the hardcoded test command by adding test registry --- infra/test_infra/ec2/setup.py | 37 ++++++++++++++++++++++++++--------- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 102563cd2519..70c36fc9b8ba 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -6,6 +6,17 @@ LOGGER = create_logger(__name__) +# Registry for framework-specific test execution +# Frameworks requiring direct Python function calls with resources should be registered here +TEST_REGISTRY = { + "vllm": { + "module": "test.v2.ec2.vllm.test_ec2", + "function": "test_vllm_on_ec2", + "requires_resources": True, + }, + # Future frameworks to be added here +} + class EC2Platform: def __init__(self): @@ -54,17 +65,25 @@ def execute_command(self, cmd): "FRAMEWORK": self.framework, } - # Check if this is a vLLM test command - # TODO: check if there is a better way to handle this - if self.framework == "vllm" and "test/v2/ec2/vllm/test_ec2.py" in cmd: - LOGGER.info(f"Executing vLLM test via direct call: {cmd}") - from test.v2.ec2.vllm.test_ec2 import test_vllm_on_ec2 - - # Pass resources and image_uri; test reads config from env vars - test_vllm_on_ec2(self.resources, self.image_uri) + # Check registry for framework-specific handling + test_config = TEST_REGISTRY.get(self.framework) + + if test_config and test_config.get("requires_resources"): + # Direct Python function call for tests requiring resource access + LOGGER.info(f"Executing {self.framework} test via direct call: {cmd}") + + module_path = test_config["module"] + function_name = test_config["function"] + + # Dynamically import and call the test function + module = __import__(module_path, fromlist=[function_name]) + test_function = getattr(module, function_name) + + # Pass resources and image_uri directly + test_function(self.resources, self.image_uri) LOGGER.info(f"Command completed successfully: {cmd}") else: - # Standard shell command execution for other cases + # Standard shell command execution repo_root = get_cloned_folder_path() with self.ctx.cd(repo_root): diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index a007d441998e..e79fe18ff8f8 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - - platform: eks - params: - cluster: dlc-vllm - namespace: vllm - run: - - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + # - platform: eks + # params: + # cluster: dlc-vllm + # namespace: vllm + # run: + # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From eea38330a5c48f3ca190e83008aea41dd4725afe Mon Sep 17 00:00:00 2001 
From: Jinyan Li Date: Mon, 20 Oct 2025 22:42:05 -0700 Subject: [PATCH 30/33] test current test path to confirm still working correctly --- dlc_developer_config.toml | 2 +- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 6bfe70c08d6d..e433e779db14 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = true +use_new_test_structure = false ### On by default sanity_tests = true diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index e79fe18ff8f8..a007d441998e 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -59,9 +59,9 @@ images: run: - python test/v2/ec2/vllm/test_ec2.py - # - platform: eks - # params: - # cluster: dlc-vllm - # namespace: vllm - # run: - # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + - platform: eks + params: + cluster: dlc-vllm + namespace: vllm + run: + - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From 219e27cf9e678124969f18b14da5e23133722331 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 21 Oct 2025 00:16:01 -0700 Subject: [PATCH 31/33] test with added instance type parsing check against assigned instance type --- dlc_developer_config.toml | 2 +- infra/test_infra/ec2/setup.py | 21 +++++++++++++++++++++ vllm/buildspec.yml | 14 +++++++------- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e433e779db14..6bfe70c08d6d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = false +use_new_test_structure = true ### On by default sanity_tests = true diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 70c36fc9b8ba..46903b9ebad9 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -37,8 +37,29 @@ def setup(self, params): self.framework = params.get("framework") self.arch_type = params.get("arch_type", "x86_64") self.image_uri = params.get("image_uri") + self.instance_type = params.get("instance_type") + self.node_count = params.get("node_count") if self.framework == "vllm": + # Validate buildspec params match hardcoded logic for vLLM + is_arm64 = "arm64" in self.image_uri + expected_instance_type = "g5g.16xlarge" if is_arm64 else "p4d.24xlarge" + expected_node_count = 1 if is_arm64 else 2 + + if self.instance_type and self.instance_type != expected_instance_type: + LOGGER.warning( + f"Buildspec instance_type '{self.instance_type}' differs from " + f"hardcoded value '{expected_instance_type}'. Using hardcoded value." + ) + + # Note: The platform validator already enforces node_count == 2 for multi-node EFA tests, + # so the node_count check below would only trigger if the validator is bypassed or modified + if self.node_count and self.node_count != expected_node_count: + LOGGER.warning( + f"Buildspec node_count '{self.node_count}' differs from " + f"hardcoded value '{expected_node_count}'. Using hardcoded value." 
+ ) + # vLLM requires vLLM-specific setup (FSx + multi-node) LOGGER.info(f"Setting up vLLM infrastructure for image: {self.image_uri}") from infra.test_infra.ec2.vllm.setup_ec2 import setup as vllm_setup diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index a007d441998e..efa0c07b97cf 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -54,14 +54,14 @@ images: tests: - platform: ec2-multi-node-efa params: - instance_type: p4d.24xlarge + instance_type: p5.48xlarge node_count: 2 run: - python test/v2/ec2/vllm/test_ec2.py - - platform: eks - params: - cluster: dlc-vllm - namespace: vllm - run: - - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + # - platform: eks + # params: + # cluster: dlc-vllm + # namespace: vllm + # run: + # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file From 842453e7a3fa285e861f0396b379778585904bd9 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 21 Oct 2025 14:43:54 -0700 Subject: [PATCH 32/33] formatting and clean up code, rerun test with unexpected instance type --- infra/test_infra/ec2/setup.py | 18 +++++++++--------- infra/test_infra/ec2/vllm/setup_ec2.py | 20 ++++++++++---------- test/v2/ec2/efa/test_efa.py | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/infra/test_infra/ec2/setup.py b/infra/test_infra/ec2/setup.py index 46903b9ebad9..26cdb652dad9 100644 --- a/infra/test_infra/ec2/setup.py +++ b/infra/test_infra/ec2/setup.py @@ -42,16 +42,16 @@ def setup(self, params): if self.framework == "vllm": # Validate buildspec params match hardcoded logic for vLLM - is_arm64 = "arm64" in self.image_uri + is_arm64 = self.arch_type == "arm64" expected_instance_type = "g5g.16xlarge" if is_arm64 else "p4d.24xlarge" expected_node_count = 1 if is_arm64 else 2 - + if self.instance_type and self.instance_type != expected_instance_type: LOGGER.warning( f"Buildspec instance_type '{self.instance_type}' differs from " f"hardcoded value '{expected_instance_type}'. Using hardcoded value." ) - + # Note: The platform validator already enforces node_count == 2 for multi-node EFA tests, # so the node_count check below would only trigger if the validator is bypassed or modified if self.node_count and self.node_count != expected_node_count: @@ -59,12 +59,12 @@ def setup(self, params): f"Buildspec node_count '{self.node_count}' differs from " f"hardcoded value '{expected_node_count}'. Using hardcoded value." 
) - + # vLLM requires vLLM-specific setup (FSx + multi-node) LOGGER.info(f"Setting up vLLM infrastructure for image: {self.image_uri}") from infra.test_infra.ec2.vllm.setup_ec2 import setup as vllm_setup - self.resources = vllm_setup(self.image_uri) + self.resources = vllm_setup(self.image_uri, self.arch_type) LOGGER.info("vLLM setup completed successfully") else: # standard EC2 setup for other frameworks @@ -88,18 +88,18 @@ def execute_command(self, cmd): # Check registry for framework-specific handling test_config = TEST_REGISTRY.get(self.framework) - + if test_config and test_config.get("requires_resources"): # Direct Python function call for tests requiring resource access LOGGER.info(f"Executing {self.framework} test via direct call: {cmd}") - + module_path = test_config["module"] function_name = test_config["function"] - + # Dynamically import and call the test function module = __import__(module_path, fromlist=[function_name]) test_function = getattr(module, function_name) - + # Pass resources and image_uri directly test_function(self.resources, self.image_uri) LOGGER.info(f"Command completed successfully: {cmd}") diff --git a/infra/test_infra/ec2/vllm/setup_ec2.py b/infra/test_infra/ec2/vllm/setup_ec2.py index 29382bb7bba4..e45ace2b1be6 100644 --- a/infra/test_infra/ec2/vllm/setup_ec2.py +++ b/infra/test_infra/ec2/vllm/setup_ec2.py @@ -45,15 +45,15 @@ TEST_ID = str(uuid.uuid4()) -def ec2_instance_ami(region, image): - if "arm64" in image: +def ec2_instance_ami(region, arch_type): + if arch_type == "arm64": return AL2023_BASE_DLAMI_ARM64_US_WEST_2 return test_utils.get_dlami_id(region) -def ec2_instance_type(image): - if "arm64" in image: +def ec2_instance_type(arch_type): + if arch_type == "arm64": return "g5g.16xlarge" else: return "p4d.24xlarge" @@ -488,12 +488,12 @@ def wait_for_instances(instance_ids): raise Exception("Cleanup errors occurred:\n" + "\n".join(cleanup_errors)) -def launch_ec2_instances(ec2_cli, image): +def launch_ec2_instances(ec2_cli, arch_type): """Launch EC2 instances with EFA support""" - instance_type = ec2_instance_type(image) - ami_id = ec2_instance_ami(DEFAULT_REGION, image) + instance_type = ec2_instance_type(arch_type) + ami_id = ec2_instance_ami(DEFAULT_REGION, arch_type) az_options = availability_zone_options(ec2_cli, instance_type, DEFAULT_REGION) - is_arm64 = True if "arm64" in image else False + is_arm64 = arch_type == "arm64" instances_info = efa_ec2_instances( ec2_client=ec2_cli, @@ -591,7 +591,7 @@ def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_ connection.run(cmd) -def setup(image): +def setup(image, arch_type): """Main setup function for VLLM on EC2 with FSx""" LOGGER.info("Testing vllm on ec2........") fsx = FsxSetup(DEFAULT_REGION) @@ -602,7 +602,7 @@ def setup(image): vpc_id = get_default_vpc_id(ec2_cli) subnet_ids = get_subnet_id_by_vpc(ec2_cli, vpc_id) - instance_result = launch_ec2_instances(ec2_cli, image) + instance_result = launch_ec2_instances(ec2_cli, arch_type) resources["instances_info"] = instance_result["instances"] resources["elastic_ips"] = instance_result["elastic_ips"] resources["connection_params"] = instance_result["connection_params"] diff --git a/test/v2/ec2/efa/test_efa.py b/test/v2/ec2/efa/test_efa.py index 88a1397b19f3..e34fcdb604df 100644 --- a/test/v2/ec2/efa/test_efa.py +++ b/test/v2/ec2/efa/test_efa.py @@ -211,7 +211,7 @@ def _setup_container(connection, docker_image, container_name): f"docker run --runtime=nvidia --gpus all -id --name {container_name} --network host --ulimit 
memlock=-1:-1 " f"{docker_all_devices_arg} -v $HOME/test/v2:/test/v2 -v /dev/shm:/dev/shm {docker_image} bash" ) - + LOGGER.info(f"Container {container_name} started successfully") From 8ed0f904b1f8fe3b35a20cd63eb98b579513b7e8 Mon Sep 17 00:00:00 2001 From: Jinyan Li Date: Tue, 21 Oct 2025 16:12:11 -0700 Subject: [PATCH 33/33] Revert config changes --- dlc_developer_config.toml | 2 +- vllm/buildspec.yml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 6bfe70c08d6d..e433e779db14 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -58,7 +58,7 @@ notify_test_failures = false [test] # Set to true to use the new test structure path for frameworks # Off by default (set to false) -use_new_test_structure = true +use_new_test_structure = false ### On by default sanity_tests = true diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index efa0c07b97cf..1939af0be055 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -54,14 +54,14 @@ images: tests: - platform: ec2-multi-node-efa params: - instance_type: p5.48xlarge + instance_type: p4d.24xlarge node_count: 2 run: - python test/v2/ec2/vllm/test_ec2.py # - platform: eks - # params: - # cluster: dlc-vllm - # namespace: vllm - # run: - # - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file + params: + cluster: dlc-vllm + namespace: vllm + run: + - python test/v2/eks/vllm/vllm_eks_test.py \ No newline at end of file
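
PATCH 29 and PATCH 32 replace the hardcoded vLLM branch in infra/test_infra/ec2/setup.py with a TEST_REGISTRY that maps a framework name to a dotted module path, an entrypoint function, and a requires_resources flag; execute_command() looks the framework up and either imports that module and calls the entrypoint directly with the provisioned resources and image URI, or falls back to running the buildspec command in a shell. The snippet below is a minimal standalone sketch of that dispatch pattern, not the repository code: the demo_tests.vllm_ec2 path, the run_vllm_test entrypoint, and the resource fields are placeholders, and importlib.import_module stands in for the patch's __import__(module_path, fromlist=[function_name]) call.

    """Standalone sketch of the TEST_REGISTRY dispatch pattern; all names are illustrative."""
    import importlib
    import sys
    import types

    # Hypothetical registry: framework name -> how its test entrypoint is invoked.
    TEST_REGISTRY = {
        "vllm": {
            "module": "demo_tests.vllm_ec2",   # placeholder dotted path
            "function": "run_vllm_test",       # placeholder entrypoint name
            "requires_resources": True,
        },
        # Frameworks without an entry fall back to plain shell execution.
    }

    # Stand up the placeholder module in-process so the sketch runs without the real repo.
    _demo = types.ModuleType("demo_tests.vllm_ec2")

    def run_vllm_test(resources, image_uri):
        print(f"running vLLM test for {image_uri} with resources: {sorted(resources)}")

    _demo.run_vllm_test = run_vllm_test
    sys.modules["demo_tests.vllm_ec2"] = _demo

    def execute_command(framework, cmd, resources, image_uri):
        """Dispatch like EC2Platform.execute_command: direct call if registered, else shell."""
        config = TEST_REGISTRY.get(framework)
        if config and config.get("requires_resources"):
            # Import the registered module and call its entrypoint with the live resources.
            module = importlib.import_module(config["module"])
            getattr(module, config["function"])(resources, image_uri)
        else:
            # Unregistered frameworks keep the existing shell-command path.
            print(f"would run shell command: {cmd}")

    if __name__ == "__main__":
        resources = {"instances_info": [], "fsx_config": None, "sg_fsx": None}
        execute_command(
            "vllm",
            "python test/v2/ec2/vllm/test_ec2.py",
            resources,
            "example.dkr.ecr.us-west-2.amazonaws.com/vllm:sample-tag",
        )
        execute_command("pytorch", "pytest -q", resources, "example-image")

Keeping the registry in one place means a new framework opts into direct invocation by adding a single entry (as the patch's "Future frameworks to be added here" comment suggests), while everything else continues through the unchanged shell path.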