From 75f399cf915a35412f2ccae516a1c86bee6ea3fb Mon Sep 17 00:00:00 2001
From: Jack Luar <jluar@precisioninno.com>
Date: Sun, 13 Apr 2025 10:41:32 +0000
Subject: [PATCH 1/6] restore resume check using last-modified file time

Signed-off-by: Jack Luar <jluar@precisioninno.com>
---
 flow/test/test_autotuner.sh          |  3 +-
 tools/AutoTuner/test/resume_check.py | 43 +++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/flow/test/test_autotuner.sh b/flow/test/test_autotuner.sh
index afd4e6ed8d..1e25d9c1ce 100755
--- a/flow/test/test_autotuner.sh
+++ b/flow/test/test_autotuner.sh
@@ -30,8 +30,7 @@ if [ "$PLATFORM_WITHOUT_DASHES" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; the
   python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
 
   echo "Running AutoTuner resume test (only once)"
-  # Temporarily disable resume check test due to flakiness
-  #python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
+  python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
 
   echo "Running AutoTuner binary check (only once)"
   openroad_autotuner -h
diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
index 69eaec1f24..761315093f 100644
--- a/tools/AutoTuner/test/resume_check.py
+++ b/tools/AutoTuner/test/resume_check.py
@@ -32,6 +32,7 @@
 ## POSSIBILITY OF SUCH DAMAGE.
 ###############################################################################
 
+import glob
 import unittest
 import subprocess
 import os
@@ -41,9 +42,6 @@
 from contextlib import contextmanager
 
 cur_dir = os.path.dirname(os.path.abspath(__file__))
-src_dir = os.path.join(cur_dir, "../src")
-orfs_dir = os.path.join(cur_dir, "../../../flow")
-os.chdir(src_dir)
 
 
 @contextmanager
@@ -65,18 +63,16 @@ class ResumeCheck(unittest.TestCase):
     design = "gcd"
     samples = 5
     iterations = 2
+    experiment_name = "test-resume"
 
     def setUp(self):
         self.config = os.path.join(
-            orfs_dir, "designs", self.platform, self.design, "autotuner.json"
+            cur_dir,
+            f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json",
         )
         self.jobs = self.samples
         self.num_cpus = os.cpu_count()
 
-        # How it works: Say we have 5 samples and 5 iterations.
-        # If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
-        #  We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)
-
         # Cast to 1 decimal place
         res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
         options = ["", "--resume"]
@@ -87,21 +83,48 @@ def setUp(self):
             f" --platform {self.platform}"
             f" --config {self.config}"
             f" --jobs {self.jobs}"
-            f" --experiment test-resume"
+            f" --experiment {self.experiment_name}"
             f" tune --iterations {self.iterations} --samples {self.samples}"
             f" --resources_per_trial {res_per_trial}"
             f" {c}"
             for c in options
         ]
 
+    def check_trial_times(self, iteration: int = 0) -> str:
+        """
+        Checks the nth iteration time of a trial.
+
+        :param iteration: The iteration to check.
+        :return: The latest modified UNIX time of the nth iteration.
+        """
+        if iteration < 0 or iteration >= self.iterations:
+            raise ValueError("Iteration must be between 0 and iterations - 1")
+
+        experiment_dir = os.path.join(
+            cur_dir,
+            f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune",
+        )
+        folders = glob.glob(os.path.join(experiment_dir, f"variant-*-or-{iteration}"))
+        return max((os.path.getmtime(folder) for folder in folders), default=9e99)
+
     def test_tune_resume(self):
         # Goal is to first run the first config (without resume) and then run the second config (with resume)
         # and check if the run is able to complete.
 
         # Run the first config asynchronously.
         print("Running the first config")
+        latest_modified_time = 0
         with managed_process(self.commands[0], shell=True) as proc:
-            time.sleep(120)
+            time.sleep(30)
+            # Check if first config is complete
+            while True:
+                cur_modified_time = self.check_trial_times()
+                print(f"Current modified time: {cur_modified_time}")
+                print(f"Latest modified time: {latest_modified_time}")
+                if abs(cur_modified_time - latest_modified_time) < 1e-6:
+                    break
+                latest_modified_time = cur_modified_time
+                time.sleep(10)
 
         # Keep trying to stop the ray cluster until it is stopped
         while 1:

From 0dab7e37c54f4fff684fce95c24911d3a3038a08 Mon Sep 17 00:00:00 2001
From: Jack Luar <jluar@precisioninno.com>
Date: Sun, 13 Apr 2025 14:28:51 +0000
Subject: [PATCH 2/6] refactor resume check: rename exec variable and improve
 subprocess error handling

Signed-off-by: Jack Luar <jluar@precisioninno.com>
---
 tools/AutoTuner/test/resume_check.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
index 761315093f..d027973d91 100644
--- a/tools/AutoTuner/test/resume_check.py
+++ b/tools/AutoTuner/test/resume_check.py
@@ -76,9 +76,9 @@ def setUp(self):
         # Cast to 1 decimal place
         res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
         options = ["", "--resume"]
-        self.exec = AutoTunerTestUtils.get_exec_cmd()
+        self.executable_command = AutoTunerTestUtils.get_exec_cmd()
         self.commands = [
-            f"{self.exec}"
+            f"{self.executable_command}"
             f" --design {self.design}"
             f" --platform {self.platform}"
             f" --config {self.config}"
@@ -90,12 +90,13 @@ def setUp(self):
             for c in options
         ]
 
-    def check_trial_times(self, iteration: int = 0) -> str:
+    def check_trial_times(self, iteration: int = 0) -> int:
         """
         Checks the nth iteration time of a trial.
 
         :param iteration: The iteration to check.
         :return: The latest modified UNIX time of the nth iteration.
+                 If no folders are found, returns a default value of 9e99.
         """
         if iteration < 0 or iteration >= self.iterations:
             raise ValueError("Iteration must be between 0 and iterations - 1")
@@ -114,24 +115,33 @@ def test_tune_resume(self):
         # Run the first config asynchronously.
         print("Running the first config")
         latest_modified_time = 0
-        with managed_process(self.commands[0], shell=True) as proc:
+        with managed_process(self.commands[0].split()) as proc:
             time.sleep(30)
             # Check if first config is complete
             while True:
                 cur_modified_time = self.check_trial_times()
                 print(f"Current modified time: {cur_modified_time}")
                 print(f"Latest modified time: {latest_modified_time}")
-                if abs(cur_modified_time - latest_modified_time) < 1e-6:
+                if abs(cur_modified_time - latest_modified_time) < 1e-3:
                     break
                 latest_modified_time = cur_modified_time
                 time.sleep(10)
 
         # Keep trying to stop the ray cluster until it is stopped
         while 1:
-            proc = subprocess.run("ray status", shell=True)
+            proc = subprocess.run(
+                "ray status", shell=True, capture_output=True, text=True
+            )
+            if proc.returncode != 0:
+                print(f"Error running 'ray status': {proc.stderr}")
             no_nodes = proc.returncode != 0
-            proc = subprocess.run("ray stop", shell=True)
-            successful = proc.returncode in accepted_rc
+            proc = subprocess.run(
+                "ray stop", shell=True, capture_output=True, text=True
+            )
+            if proc.returncode not in accepted_rc:
+                print(f"Error running 'ray stop': {proc.stderr}")
+                raise RuntimeError("Failed to stop the ray cluster")
+            successful = True
 
             if no_nodes and successful:
                 break

From 30098941e3a030e79203f8a80cbe08e8f8068926 Mon Sep 17 00:00:00 2001
From: Jack Luar <jluar@precisioninno.com>
Date: Mon, 21 Apr 2025 17:25:16 +0000
Subject: [PATCH 3/6] make iteration range error message clearer

Signed-off-by: Jack Luar <jluar@precisioninno.com>
---
 tools/AutoTuner/test/resume_check.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
index d027973d91..71ca108bc4 100644
--- a/tools/AutoTuner/test/resume_check.py
+++ b/tools/AutoTuner/test/resume_check.py
@@ -99,7 +99,7 @@ def check_trial_times(self, iteration: int = 0) -> int:
                  If no folders are found, returns a default value of 9e99.
         """
         if iteration < 0 or iteration >= self.iterations:
-            raise ValueError("Iteration must be between 0 and iterations - 1")
+            raise ValueError("Iteration must be between 0 and (iterations - 1)")
 
         experiment_dir = os.path.join(
             cur_dir,

From cf5ed3db679e4d1e58b9ea4e0daad670616e66bd Mon Sep 17 00:00:00 2001
From: Jack Luar <jluar@precisioninno.com>
Date: Mon, 21 Apr 2025 17:28:09 +0000
Subject: [PATCH 4/6] rename check_trial_times to get_trial_times for clarity

Signed-off-by: Jack Luar <jluar@precisioninno.com>
---
 tools/AutoTuner/test/resume_check.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
index 71ca108bc4..747523b003 100644
--- a/tools/AutoTuner/test/resume_check.py
+++ b/tools/AutoTuner/test/resume_check.py
@@ -90,9 +90,9 @@ def setUp(self):
             for c in options
         ]
 
-    def check_trial_times(self, iteration: int = 0) -> int:
+    def get_trial_times(self, iteration: int = 0) -> int:
         """
-        Checks the nth iteration time of a trial.
+        Returns the nth iteration time of a trial.
 
         :param iteration: The iteration to check.
         :return: The latest modified UNIX time of the nth iteration.

From ab31e29324c62d6c62d5bb7c0fb56494b13bba8d Mon Sep 17 00:00:00 2001
From: Jack Luar <jluar@precisioninno.com>
Date: Fri, 9 May 2025 16:55:33 +0000
Subject: [PATCH 5/6] fix function call to renamed get_trial_times

Signed-off-by: Jack Luar <jluar@precisioninno.com>
---
 tools/AutoTuner/test/resume_check.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
index 747523b003..2823edc2c8 100644
--- a/tools/AutoTuner/test/resume_check.py
+++ b/tools/AutoTuner/test/resume_check.py
@@ -119,7 +119,7 @@ def test_tune_resume(self):
             time.sleep(30)
             # Check if first config is complete
             while True:
-                cur_modified_time = self.check_trial_times()
+                cur_modified_time = self.get_trial_times()
                 print(f"Current modified time: {cur_modified_time}")
                 print(f"Latest modified time: {latest_modified_time}")
                 if abs(cur_modified_time - latest_modified_time) < 1e-3:

From e8cdcfe3404bd01ef2fb4da9b9f42e61f5fee541 Mon Sep 17 00:00:00 2001
From: Jack Luar <jluar@precisioninno.com>
Date: Wed, 14 May 2025 16:14:47 +0000
Subject: [PATCH 6/6] convert generator expression into for loop for better
 readability

Signed-off-by: Jack Luar <jluar@precisioninno.com>
---
 tools/AutoTuner/test/resume_check.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
index 2823edc2c8..21604753d7 100644
--- a/tools/AutoTuner/test/resume_check.py
+++ b/tools/AutoTuner/test/resume_check.py
@@ -42,6 +42,7 @@
 from contextlib import contextmanager
 
 cur_dir = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_MODIFIED_TIME = 0
 
 
 @contextmanager
@@ -90,13 +91,13 @@ def setUp(self):
             for c in options
         ]
 
-    def get_trial_times(self, iteration: int = 0) -> int:
+    def get_last_modified_time(self, iteration: int = 0) -> int:
         """
         Returns the nth iteration time of a trial.
 
         :param iteration: The iteration to check.
         :return: The latest modified UNIX time of the nth iteration.
-                 If no folders are found, returns a default value of 9e99.
+                 If no folders are found, returns a default value.
         """
         if iteration < 0 or iteration >= self.iterations:
             raise ValueError("Iteration must be between 0 and (iterations - 1)")
@@ -105,8 +106,15 @@ def get_trial_times(self, iteration: int = 0) -> int:
             cur_dir,
             f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune",
         )
-        folders = glob.glob(os.path.join(experiment_dir, f"variant-*-or-{iteration}"))
-        return max((os.path.getmtime(folder) for folder in folders), default=9e99)
+        iteration_folders = glob.glob(
+            os.path.join(experiment_dir, f"variant-*-or-{iteration}")
+        )
+        latest_modified_time = DEFAULT_MODIFIED_TIME
+        for folder in iteration_folders:
+            modified_time = os.path.getmtime(folder)
+            if modified_time > latest_modified_time:
+                latest_modified_time = modified_time
+        return latest_modified_time
 
     def test_tune_resume(self):
         # Goal is to first run the first config (without resume) and then run the second config (with resume)
@@ -119,7 +127,7 @@ def test_tune_resume(self):
             time.sleep(30)
             # Check if first config is complete
             while True:
-                cur_modified_time = self.get_trial_times()
+                cur_modified_time = self.get_last_modified_time()
                 print(f"Current modified time: {cur_modified_time}")
                 print(f"Latest modified time: {latest_modified_time}")
                 if abs(cur_modified_time - latest_modified_time) < 1e-3: