[AutoTuner] Restore resume check #3070

Open · wants to merge 6 commits into base: master

3 changes: 1 addition & 2 deletions flow/test/test_autotuner.sh
@@ -30,8 +30,7 @@ if [ "$PLATFORM_WITHOUT_DASHES" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files

echo "Running AutoTuner resume test (only once)"
# Temporarily disable resume check test due to flakiness
#python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume

echo "Running AutoTuner binary check (only once)"
openroad_autotuner -h
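
For context, the restored line selects one test method by its dotted name. The sketch below is not part of this pull request; it shows an equivalent programmatic invocation through the standard unittest API, assuming the repository root is the current working directory so the dotted path resolves.

import unittest

# Load exactly the test that test_autotuner.sh re-enables.
suite = unittest.TestLoader().loadTestsFromName(
    "tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume"
)
# Run it and propagate a non-zero exit code on failure, like `python3 -m unittest`.
result = unittest.TextTestRunner(verbosity=2).run(suite)
raise SystemExit(0 if result.wasSuccessful() else 1)
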
73 changes: 57 additions & 16 deletions tools/AutoTuner/test/resume_check.py
@@ -32,6 +32,7 @@
## POSSIBILITY OF SUCH DAMAGE.
###############################################################################

import glob
import unittest
import subprocess
import os
@@ -41,9 +42,7 @@
from contextlib import contextmanager

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(cur_dir, "../src")
orfs_dir = os.path.join(cur_dir, "../../../flow")
os.chdir(src_dir)
DEFAULT_MODIFIED_TIME = 0


@contextmanager
@@ -65,50 +64,92 @@ class ResumeCheck(unittest.TestCase):
design = "gcd"
samples = 5
iterations = 2
experiment_name = "test-resume"

def setUp(self):
self.config = os.path.join(
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
cur_dir,
f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json",
)
self.jobs = self.samples
self.num_cpus = os.cpu_count()

# How it works: Say we have 5 samples and 5 iterations.
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)

# Cast to 1 decimal place
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
options = ["", "--resume"]
self.exec = AutoTunerTestUtils.get_exec_cmd()
self.executable_command = AutoTunerTestUtils.get_exec_cmd()
self.commands = [
f"{self.exec}"
f"{self.executable_command}"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {self.config}"
f" --jobs {self.jobs}"
f" --experiment test-resume"
f" --experiment {self.experiment_name}"
f" tune --iterations {self.iterations} --samples {self.samples}"
f" --resources_per_trial {res_per_trial}"
f" {c}"
for c in options
]

def get_last_modified_time(self, iteration: int = 0) -> int:
"""
Returns the nth iteration time of a trial.

:param iteration: The iteration to check.
:return: The latest modified UNIX time of the nth iteration.
If no folders are found, returns a default value.
"""
if iteration < 0 or iteration >= self.iterations:
raise ValueError("Iteration must be between 0 and (iterations - 1)")

experiment_dir = os.path.join(
cur_dir,
f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune",
)
iteration_folders = glob.glob(
os.path.join(experiment_dir, f"variant-*-or-{iteration}")
)
latest_modified_time = DEFAULT_MODIFIED_TIME
for folder in iteration_folders:
modified_time = os.path.getmtime(folder)
if modified_time > latest_modified_time:
latest_modified_time = modified_time
return latest_modified_time

def test_tune_resume(self):
# Goal is to first run the first config (without resume) and then run the second config (with resume)
# and check if the run is able to complete.

# Run the first config asynchronously.
print("Running the first config")
with managed_process(self.commands[0], shell=True) as proc:
time.sleep(120)
latest_modified_time = 0
with managed_process(self.commands[0].split()) as proc:
time.sleep(30)
# Check if first config is complete
while True:
cur_modified_time = self.get_last_modified_time()
print(f"Current modified time: {cur_modified_time}")
print(f"Latest modified time: {latest_modified_time}")
if abs(cur_modified_time - latest_modified_time) < 1e-3:
break
latest_modified_time = cur_modified_time
time.sleep(10)

# Keep trying to stop the ray cluster until it is stopped
while 1:
proc = subprocess.run("ray status", shell=True)
proc = subprocess.run(
"ray status", shell=True, capture_output=True, text=True
)
if proc.returncode != 0:
print(f"Error running 'ray status': {proc.stderr}")
no_nodes = proc.returncode != 0
proc = subprocess.run("ray stop", shell=True)
successful = proc.returncode in accepted_rc
proc = subprocess.run(
"ray stop", shell=True, capture_output=True, text=True
)
if proc.returncode not in accepted_rc:
print(f"Error running 'ray stop': {proc.stderr}")
raise RuntimeError("Failed to stop the ray cluster")
successful = True

if no_nodes and successful:
break
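
The resources_per_trial arithmetic in setUp relies on Ray accepting fractional CPU reservations: reserving num_cpus / samples CPUs per trial caps the number of concurrent trials at the sample count. A worked example, illustrative only and assuming a 16-core machine (the 3.2 figure quoted in the code comment):

num_cpus = 16  # hypothetical core count; the test uses os.cpu_count() instead
samples = 5
# Cast to one decimal place, as in setUp.
res_per_trial = float("{:.1f}".format(num_cpus / samples))
print(res_per_trial)  # 3.2 -> each trial reserves 3.2 CPUs, so 5 trials fit at once
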
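
The new get_last_modified_time helper and the polling loop in test_tune_resume implement a settle check: the iteration-0 trial folders are scanned repeatedly, and the first run is considered finished once the newest modification time stops changing between polls. A self-contained sketch of that idea follows; the experiment directory and poll interval are illustrative, not the test's actual path handling.

import glob
import os
import time

DEFAULT_MODIFIED_TIME = 0


def latest_mtime(experiment_dir: str, iteration: int = 0) -> float:
    """Return the newest mtime among the variant folders of one iteration."""
    folders = glob.glob(os.path.join(experiment_dir, f"variant-*-or-{iteration}"))
    return max((os.path.getmtime(f) for f in folders), default=DEFAULT_MODIFIED_TIME)


def wait_until_settled(experiment_dir: str, poll_s: float = 10.0) -> None:
    """Block until the newest mtime is unchanged between two consecutive polls."""
    previous = DEFAULT_MODIFIED_TIME
    while True:
        current = latest_mtime(experiment_dir)
        if abs(current - previous) < 1e-3:  # no new writes since the last poll
            return
        previous = current
        time.sleep(poll_s)
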
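
Finally, the shutdown loop keeps issuing ray stop until ray status reports that no cluster is reachable. A standalone sketch of the same retry logic; accepted_rc is used by the test but not shown in this diff, so it is assumed here to be {0, 1}.

import subprocess
import time

accepted_rc = {0, 1}  # assumption for illustration only


def stop_ray_cluster(poll_s: float = 5.0) -> None:
    """Retry until ray status finds no cluster and ray stop exits cleanly."""
    while True:
        status = subprocess.run("ray status", shell=True, capture_output=True, text=True)
        no_nodes = status.returncode != 0  # non-zero exit -> no running cluster
        stop = subprocess.run("ray stop", shell=True, capture_output=True, text=True)
        if stop.returncode not in accepted_rc:
            raise RuntimeError(f"Failed to stop the ray cluster: {stop.stderr}")
        if no_nodes:
            return
        time.sleep(poll_s)
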