From 75f399cf915a35412f2ccae516a1c86bee6ea3fb Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 13 Apr 2025 10:41:32 +0000 Subject: [PATCH 1/6] restore resume check using last modified filetime Signed-off-by: Jack Luar --- flow/test/test_autotuner.sh | 3 +- tools/AutoTuner/test/resume_check.py | 43 +++++++++++++++++++++------- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/flow/test/test_autotuner.sh b/flow/test/test_autotuner.sh index afd4e6ed8d..1e25d9c1ce 100755 --- a/flow/test/test_autotuner.sh +++ b/flow/test/test_autotuner.sh @@ -30,8 +30,7 @@ if [ "$PLATFORM_WITHOUT_DASHES" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; the python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files echo "Running AutoTuner resume test (only once)" - # Temporarily disable resume check test due to flakiness - #python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume + python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume echo "Running AutoTuner binary check (only once)" openroad_autotuner -h diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 69eaec1f24..761315093f 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -32,6 +32,7 @@ ## POSSIBILITY OF SUCH DAMAGE. ############################################################################### +import glob import unittest import subprocess import os @@ -41,9 +42,6 @@ from contextlib import contextmanager cur_dir = os.path.dirname(os.path.abspath(__file__)) -src_dir = os.path.join(cur_dir, "../src") -orfs_dir = os.path.join(cur_dir, "../../../flow") -os.chdir(src_dir) @contextmanager @@ -65,18 +63,16 @@ class ResumeCheck(unittest.TestCase): design = "gcd" samples = 5 iterations = 2 + experiment_name = "test-resume" def setUp(self): self.config = os.path.join( - orfs_dir, "designs", self.platform, self.design, "autotuner.json" + cur_dir, + f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json", ) self.jobs = self.samples self.num_cpus = os.cpu_count() - # How it works: Say we have 5 samples and 5 iterations. - # If we want to limit to only 5 trials (and avoid any parallelism magic by Ray) - # We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!) - # Cast to 1 decimal place res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples)) options = ["", "--resume"] @@ -87,21 +83,48 @@ def setUp(self): f" --platform {self.platform}" f" --config {self.config}" f" --jobs {self.jobs}" - f" --experiment test-resume" + f" --experiment {self.experiment_name}" f" tune --iterations {self.iterations} --samples {self.samples}" f" --resources_per_trial {res_per_trial}" f" {c}" for c in options ] + def check_trial_times(self, iteration: int = 0) -> str: + """ + Checks the nth iteration time of a trial. + + :param iteration: The iteration to check. + :return: The latest modified UNIX time of the nth iteration. + """ + if iteration < 0 or iteration >= self.iterations: + raise ValueError("Iteration must be between 0 and iterations - 1") + + experiment_dir = os.path.join( + cur_dir, + f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune", + ) + folders = glob.glob(os.path.join(experiment_dir, f"variant-*-or-{iteration}")) + return max((os.path.getmtime(folder) for folder in folders), default=9e99) + def test_tune_resume(self): # Goal is to first run the first config (without resume) and then run the second config (with resume) # and check if the run is able to complete. # Run the first config asynchronously. print("Running the first config") + latest_modified_time = 0 with managed_process(self.commands[0], shell=True) as proc: - time.sleep(120) + time.sleep(30) + # Check if first config is complete + while True: + cur_modified_time = self.check_trial_times() + print(f"Current modified time: {cur_modified_time}") + print(f"Latest modified time: {latest_modified_time}") + if abs(cur_modified_time - latest_modified_time) < 1e-6: + break + latest_modified_time = cur_modified_time + time.sleep(10) # Keep trying to stop the ray cluster until it is stopped while 1: From 0dab7e37c54f4fff684fce95c24911d3a3038a08 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Sun, 13 Apr 2025 14:28:51 +0000 Subject: [PATCH 2/6] refactor resume check: rename exec variable and improve subprocess error handling Signed-off-by: Jack Luar --- tools/AutoTuner/test/resume_check.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 761315093f..d027973d91 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -76,9 +76,9 @@ def setUp(self): # Cast to 1 decimal place res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples)) options = ["", "--resume"] - self.exec = AutoTunerTestUtils.get_exec_cmd() + self.executable_command = AutoTunerTestUtils.get_exec_cmd() self.commands = [ - f"{self.exec}" + f"{self.executable_command}" f" --design {self.design}" f" --platform {self.platform}" f" --config {self.config}" @@ -90,12 +90,13 @@ def setUp(self): for c in options ] - def check_trial_times(self, iteration: int = 0) -> str: + def check_trial_times(self, iteration: int = 0) -> int: """ Checks the nth iteration time of a trial. :param iteration: The iteration to check. :return: The latest modified UNIX time of the nth iteration. + If no folders are found, returns a default value of 9e99. """ if iteration < 0 or iteration >= self.iterations: raise ValueError("Iteration must be between 0 and iterations - 1") @@ -114,24 +115,33 @@ def test_tune_resume(self): # Run the first config asynchronously. print("Running the first config") latest_modified_time = 0 - with managed_process(self.commands[0], shell=True) as proc: + with managed_process(self.commands[0].split()) as proc: time.sleep(30) # Check if first config is complete while True: cur_modified_time = self.check_trial_times() print(f"Current modified time: {cur_modified_time}") print(f"Latest modified time: {latest_modified_time}") - if abs(cur_modified_time - latest_modified_time) < 1e-6: + if abs(cur_modified_time - latest_modified_time) < 1e-3: break latest_modified_time = cur_modified_time time.sleep(10) # Keep trying to stop the ray cluster until it is stopped while 1: - proc = subprocess.run("ray status", shell=True) + proc = subprocess.run( + "ray status", shell=True, capture_output=True, text=True + ) + if proc.returncode != 0: + print(f"Error running 'ray status': {proc.stderr}") no_nodes = proc.returncode != 0 - proc = subprocess.run("ray stop", shell=True) - successful = proc.returncode in accepted_rc + proc = subprocess.run( + "ray stop", shell=True, capture_output=True, text=True + ) + if proc.returncode not in accepted_rc: + print(f"Error running 'ray stop': {proc.stderr}") + raise RuntimeError("Failed to stop the ray cluster") + successful = True if no_nodes and successful: break From 30098941e3a030e79203f8a80cbe08e8f8068926 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Mon, 21 Apr 2025 17:25:16 +0000 Subject: [PATCH 3/6] make error clearer Signed-off-by: Jack Luar --- tools/AutoTuner/test/resume_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index d027973d91..71ca108bc4 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -99,7 +99,7 @@ def check_trial_times(self, iteration: int = 0) -> int: If no folders are found, returns a default value of 9e99. """ if iteration < 0 or iteration >= self.iterations: - raise ValueError("Iteration must be between 0 and iterations - 1") + raise ValueError("Iteration must be between 0 and (iterations - 1)") experiment_dir = os.path.join( cur_dir, From cf5ed3db679e4d1e58b9ea4e0daad670616e66bd Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Mon, 21 Apr 2025 17:28:09 +0000 Subject: [PATCH 4/6] clarify function name Signed-off-by: Jack Luar --- tools/AutoTuner/test/resume_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 71ca108bc4..747523b003 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -90,9 +90,9 @@ def setUp(self): for c in options ] - def check_trial_times(self, iteration: int = 0) -> int: + def get_trial_times(self, iteration: int = 0) -> int: """ - Checks the nth iteration time of a trial. + Returns the nth iteration time of a trial. :param iteration: The iteration to check. :return: The latest modified UNIX time of the nth iteration. From ab31e29324c62d6c62d5bb7c0fb56494b13bba8d Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Fri, 9 May 2025 16:55:33 +0000 Subject: [PATCH 5/6] fix function call Signed-off-by: Jack Luar --- tools/AutoTuner/test/resume_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 747523b003..2823edc2c8 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -119,7 +119,7 @@ def test_tune_resume(self): time.sleep(30) # Check if first config is complete while True: - cur_modified_time = self.check_trial_times() + cur_modified_time = self.get_trial_times() print(f"Current modified time: {cur_modified_time}") print(f"Latest modified time: {latest_modified_time}") if abs(cur_modified_time - latest_modified_time) < 1e-3: From e8cdcfe3404bd01ef2fb4da9b9f42e61f5fee541 Mon Sep 17 00:00:00 2001 From: Jack Luar Date: Wed, 14 May 2025 16:14:47 +0000 Subject: [PATCH 6/6] revert list comprehension into for loop for better readability Signed-off-by: Jack Luar --- tools/AutoTuner/test/resume_check.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 2823edc2c8..21604753d7 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -42,6 +42,7 @@ from contextlib import contextmanager cur_dir = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_MODIFIED_TIME = 0 @contextmanager @@ -90,13 +91,13 @@ def setUp(self): for c in options ] - def get_trial_times(self, iteration: int = 0) -> int: + def get_last_modified_time(self, iteration: int = 0) -> int: """ Returns the nth iteration time of a trial. :param iteration: The iteration to check. :return: The latest modified UNIX time of the nth iteration. - If no folders are found, returns a default value of 9e99. + If no folders are found, returns a default value. """ if iteration < 0 or iteration >= self.iterations: raise ValueError("Iteration must be between 0 and (iterations - 1)") @@ -105,8 +106,15 @@ def get_trial_times(self, iteration: int = 0) -> int: cur_dir, f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune", ) - folders = glob.glob(os.path.join(experiment_dir, f"variant-*-or-{iteration}")) - return max((os.path.getmtime(folder) for folder in folders), default=9e99) + iteration_folders = glob.glob( + os.path.join(experiment_dir, f"variant-*-or-{iteration}") + ) + latest_modified_time = DEFAULT_MODIFIED_TIME + for folder in iteration_folders: + modified_time = os.path.getmtime(folder) + if modified_time > latest_modified_time: + latest_modified_time = modified_time + return latest_modified_time def test_tune_resume(self): # Goal is to first run the first config (without resume) and then run the second config (with resume) @@ -119,7 +127,7 @@ def test_tune_resume(self): time.sleep(30) # Check if first config is complete while True: - cur_modified_time = self.get_trial_times() + cur_modified_time = self.get_last_modified_time() print(f"Current modified time: {cur_modified_time}") print(f"Latest modified time: {latest_modified_time}") if abs(cur_modified_time - latest_modified_time) < 1e-3: