From 53d09879732fd15adf350c44ca121514c406d421 Mon Sep 17 00:00:00 2001 From: Alay Dilipbhai Shah Date: Mon, 18 Mar 2024 18:37:02 -0700 Subject: [PATCH 1/2] Fail loudly and terminate if version upgrade fails --- .../computing/scheduler/comm_utils/sys_utils.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/sys_utils.py b/python/fedml/computing/scheduler/comm_utils/sys_utils.py index 64313b0864..24851acb9f 100644 --- a/python/fedml/computing/scheduler/comm_utils/sys_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/sys_utils.py @@ -787,7 +787,7 @@ def daemon_ota_upgrade(in_args): fedml_is_latest_version, local_ver, remote_ver = check_fedml_is_latest_version(in_args.version) should_upgrade = False if fedml_is_latest_version else True except Exception as e: - return + raise Exception("Failed to check the latest version with error {}.".format(str(e))) if not should_upgrade: return @@ -803,7 +803,7 @@ def daemon_ota_upgrade_with_version(in_version="release"): fedml_is_latest_version, local_ver, remote_ver = check_fedml_is_latest_version(in_version) should_upgrade = False if fedml_is_latest_version else True except Exception as e: - return + raise Exception("Failed to check the latest version with error {}.".format(str(e))) if not should_upgrade: return @@ -829,8 +829,6 @@ def run_cmd(command, show_local_console=False): print(out_str) log_return_info(command, 0) - - is_cmd_run_ok = True else: if err is not None: try: @@ -844,10 +842,8 @@ def run_cmd(command, show_local_console=False): print(err_str) log_return_info(command, ret_code) - - is_cmd_run_ok = False - - return is_cmd_run_ok + raise Exception("Run command '{}' failed with return code {}.".format(command, ret_code)) + return True def get_local_fedml_version(fedml_init_file): From 51af7eaa498bbe887c245edf645df73bf3ab4a96 Mon Sep 17 00:00:00 2001 From: Alay Shah Date: Tue, 19 Mar 2024 10:59:00 -0700 Subject: [PATCH 2/2] Raise 
exception if failed to occupy gpus --- python/fedml/computing/scheduler/comm_utils/job_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fedml/computing/scheduler/comm_utils/job_utils.py b/python/fedml/computing/scheduler/comm_utils/job_utils.py index 67bb263f81..855e17e3d8 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/job_utils.py @@ -126,8 +126,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None, return cuda_visible_gpu_ids_str except Exception as e: - logging.error(f"Error {e} Exception {traceback.format_exc()}") - return None + raise Exception(f"Error occurred while occupying gpu ids: {e} \n" + f"Exception {traceback.format_exc()}") @staticmethod def search_and_refresh_available_gpu_ids(available_gpu_ids):