@@ -76,9 +76,9 @@ def setUp(self):
76
76
# Round to 1 decimal place
77
77
res_per_trial = float ("{:.1f}" .format (self .num_cpus / self .samples ))
78
78
options = ["" , "--resume" ]
79
- self .exec = AutoTunerTestUtils .get_exec_cmd ()
79
+ self .executable_command = AutoTunerTestUtils .get_exec_cmd ()
80
80
self .commands = [
81
- f"{ self .exec } "
81
+ f"{ self .executable_command } "
82
82
f" --design { self .design } "
83
83
f" --platform { self .platform } "
84
84
f" --config { self .config } "
@@ -90,12 +90,13 @@ def setUp(self):
90
90
for c in options
91
91
]
92
92
93
- def check_trial_times (self , iteration : int = 0 ) -> str :
93
+ def check_trial_times (self , iteration : int = 0 ) -> int :
94
94
"""
95
95
Checks the nth iteration time of a trial.
96
96
97
97
:param iteration: The iteration to check.
98
98
:return: The latest modified UNIX time of the nth iteration.
99
+ If no folders are found, returns a default value of 9e99.
99
100
"""
100
101
if iteration < 0 or iteration >= self .iterations :
101
102
raise ValueError ("Iteration must be between 0 and iterations - 1" )
@@ -114,32 +115,41 @@ def test_tune_resume(self):
114
115
# Run the first config asynchronously.
115
116
print ("Running the first config" )
116
117
latest_modified_time = 0
117
- with managed_process (self .commands [0 ], shell = True ) as proc :
118
+ with managed_process (self .commands [0 ]. split () ) as proc :
118
119
time .sleep (30 )
119
120
# Check if first config is complete
120
121
while True :
121
122
cur_modified_time = self .check_trial_times ()
122
123
print (f"Current modified time: { cur_modified_time } " )
123
124
print (f"Latest modified time: { latest_modified_time } " )
124
- if abs (cur_modified_time - latest_modified_time ) < 1e-6 :
125
+ if abs (cur_modified_time - latest_modified_time ) < 1e-3 :
125
126
break
126
127
latest_modified_time = cur_modified_time
127
128
time .sleep (10 )
128
129
129
130
# Keep trying to stop the ray cluster until it is stopped
130
131
while 1 :
131
- proc = subprocess .run ("ray status" , shell = True )
132
+ proc = subprocess .run (
133
+ "ray status" , shell = True , capture_output = True , text = True
134
+ )
135
+ if proc .returncode != 0 :
136
+ print (f"Error running 'ray status': { proc .stderr } " )
132
137
no_nodes = proc .returncode != 0
133
- proc = subprocess .run ("ray stop" , shell = True )
134
- successful = proc .returncode == 0
138
+ proc = subprocess .run (
139
+ "ray stop" , shell = True , capture_output = True , text = True
140
+ )
141
+ if proc .returncode != 0 :
142
+ print (f"Error running 'ray stop': { proc .stderr } " )
143
+ raise RuntimeError ("Failed to stop the ray cluster" )
144
+ successful = True
135
145
136
146
if no_nodes and successful :
137
147
break
138
148
time .sleep (10 )
139
149
140
150
# Run the second config to completion
141
151
print ("Running the second config" )
142
- proc = subprocess .run (self .commands [1 ], shell = True )
152
+ proc = subprocess .run (self .commands [1 ], shell = True , check = True )
143
153
successful = proc .returncode == 0
144
154
self .assertTrue (successful )
145
155
0 commit comments