Skip to content

Commit 8e18ff7

Browse files
committed
refactor resume check: rename exec variable and improve subprocess error handling
Signed-off-by: Jack Luar <jluar@precisioninno.com>
1 parent 20a74f2 commit 8e18ff7

File tree

1 file changed

+19
-9
lines changed

1 file changed

+19
-9
lines changed

tools/AutoTuner/test/resume_check.py

+19 -9
Original file line number | Diff line number | Diff line change
@@ -76,9 +76,9 @@ def setUp(self):
7676
# Cast to 1 decimal place
7777
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
7878
options = ["", "--resume"]
79-
self.exec = AutoTunerTestUtils.get_exec_cmd()
79+
self.executable_command = AutoTunerTestUtils.get_exec_cmd()
8080
self.commands = [
81-
f"{self.exec}"
81+
f"{self.executable_command}"
8282
f" --design {self.design}"
8383
f" --platform {self.platform}"
8484
f" --config {self.config}"
@@ -90,12 +90,13 @@ def setUp(self):
9090
for c in options
9191
]
9292

93-
def check_trial_times(self, iteration: int = 0) -> str:
93+
def check_trial_times(self, iteration: int = 0) -> int:
9494
"""
9595
Checks the nth iteration time of a trial.
9696
9797
:param iteration: The iteration to check.
9898
:return: The latest modified UNIX time of the nth iteration.
99+
If no folders are found, returns a default value of 9e99.
99100
"""
100101
if iteration < 0 or iteration >= self.iterations:
101102
raise ValueError("Iteration must be between 0 and iterations - 1")
@@ -114,32 +115,41 @@ def test_tune_resume(self):
114115
# Run the first config asynchronously.
115116
print("Running the first config")
116117
latest_modified_time = 0
117-
with managed_process(self.commands[0], shell=True) as proc:
118+
with managed_process(self.commands[0].split()) as proc:
118119
time.sleep(30)
119120
# Check if first config is complete
120121
while True:
121122
cur_modified_time = self.check_trial_times()
122123
print(f"Current modified time: {cur_modified_time}")
123124
print(f"Latest modified time: {latest_modified_time}")
124-
if abs(cur_modified_time - latest_modified_time) < 1e-6:
125+
if abs(cur_modified_time - latest_modified_time) < 1e-3:
125126
break
126127
latest_modified_time = cur_modified_time
127128
time.sleep(10)
128129

129130
# Keep trying to stop the ray cluster until it is stopped
130131
while 1:
131-
proc = subprocess.run("ray status", shell=True)
132+
proc = subprocess.run(
133+
"ray status", shell=True, capture_output=True, text=True
134+
)
135+
if proc.returncode != 0:
136+
print(f"Error running 'ray status': {proc.stderr}")
132137
no_nodes = proc.returncode != 0
133-
proc = subprocess.run("ray stop", shell=True)
134-
successful = proc.returncode == 0
138+
proc = subprocess.run(
139+
"ray stop", shell=True, capture_output=True, text=True
140+
)
141+
if proc.returncode != 0:
142+
print(f"Error running 'ray stop': {proc.stderr}")
143+
raise RuntimeError("Failed to stop the ray cluster")
144+
successful = True
135145

136146
if no_nodes and successful:
137147
break
138148
time.sleep(10)
139149

140150
# Run the second config to completion
141151
print("Running the second config")
142-
proc = subprocess.run(self.commands[1], shell=True)
152+
proc = subprocess.run(self.commands[1], shell=True, check=True)
143153
successful = proc.returncode == 0
144154
self.assertTrue(successful)
145155

0 commit comments

Comments
 (0)