Skip to content

Commit 20a74f2

Browse files
committed
restore resume check using last modified filetime
Signed-off-by: Jack Luar <jluar@precisioninno.com>
1 parent 4e30669 commit 20a74f2

File tree

2 files changed

+34
-12
lines changed

2 files changed

+34
-12
lines changed

flow/test/test_autotuner.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@ if [ "$PLATFORM_WITHOUT_DASHES" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; the
3030
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
3131

3232
echo "Running AutoTuner resume test (only once)"
33-
# Temporarily disable resume check test due to flakiness
34-
#python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
33+
python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
3534

3635
echo "Running AutoTuner binary check (only once)"
3736
openroad_autotuner -h

tools/AutoTuner/test/resume_check.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
## POSSIBILITY OF SUCH DAMAGE.
3333
###############################################################################
3434

35+
import glob
3536
import unittest
3637
import subprocess
3738
import os
@@ -41,9 +42,6 @@
4142
from contextlib import contextmanager
4243

4344
cur_dir = os.path.dirname(os.path.abspath(__file__))
44-
src_dir = os.path.join(cur_dir, "../src")
45-
orfs_dir = os.path.join(cur_dir, "../../../flow")
46-
os.chdir(src_dir)
4745

4846

4947
@contextmanager
@@ -65,18 +63,16 @@ class ResumeCheck(unittest.TestCase):
6563
design = "gcd"
6664
samples = 5
6765
iterations = 2
66+
experiment_name = "test-resume"
6867

6968
def setUp(self):
7069
self.config = os.path.join(
71-
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
70+
cur_dir,
71+
f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json",
7272
)
7373
self.jobs = self.samples
7474
self.num_cpus = os.cpu_count()
7575

76-
# How it works: Say we have 5 samples and 5 iterations.
77-
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
78-
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)
79-
8076
# Cast to 1 decimal place
8177
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
8278
options = ["", "--resume"]
@@ -87,21 +83,48 @@ def setUp(self):
8783
f" --platform {self.platform}"
8884
f" --config {self.config}"
8985
f" --jobs {self.jobs}"
90-
f" --experiment test-resume"
86+
f" --experiment {self.experiment_name}"
9187
f" tune --iterations {self.iterations} --samples {self.samples}"
9288
f" --resources_per_trial {res_per_trial}"
9389
f" {c}"
9490
for c in options
9591
]
9692

93+
def check_trial_times(self, iteration: int = 0) -> float:
    """
    Return the latest modification time among one iteration's trial folders.

    Searches ``../../../flow/logs/<platform>/<design>/<experiment_name>-tune``
    (joined onto this test file's directory) for entries whose name matches
    ``variant-*-or-<iteration>``.

    :param iteration: The iteration to check (zero-based).
    :return: The largest ``os.path.getmtime`` value (UNIX time) across the
        matching folders, or ``9e99`` when no folder matches.
    :raises ValueError: If ``iteration`` is negative or not less than
        ``self.iterations``.
    """
    if iteration < 0 or iteration >= self.iterations:
        raise ValueError("Iteration must be between 0 and iterations - 1")

    experiment_dir = os.path.join(
        cur_dir,
        f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune",
    )
    folders = glob.glob(os.path.join(experiment_dir, f"variant-*-or-{iteration}"))
    # Keep the fallback finite: the polling loop in test_tune_resume subtracts
    # consecutive results, and inf - inf would be NaN (never "< 1e-6"), whereas
    # two 9e99 readings differ by exactly 0.
    return max((os.path.getmtime(folder) for folder in folders), default=9e99)
109+
97110
def test_tune_resume(self):
98111
# Goal is to first run the first config (without resume) and then run the second config (with resume)
99112
# and check if the run is able to complete.
100113

101114
# Run the first config asynchronously.
102115
print("Running the first config")
116+
latest_modified_time = 0
103117
with managed_process(self.commands[0], shell=True) as proc:
104-
time.sleep(120)
118+
time.sleep(30)
119+
# Check if first config is complete
120+
while True:
121+
cur_modified_time = self.check_trial_times()
122+
print(f"Current modified time: {cur_modified_time}")
123+
print(f"Latest modified time: {latest_modified_time}")
124+
if abs(cur_modified_time - latest_modified_time) < 1e-6:
125+
break
126+
latest_modified_time = cur_modified_time
127+
time.sleep(10)
105128

106129
# Keep trying to stop the ray cluster until it is stopped
107130
while 1:

0 commit comments

Comments
 (0)