|
| 1 | +# Copyright 2023 Lawrence Livermore National Security, LLC and other |
| 2 | +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. |
| 3 | +# |
| 4 | +# SPDX-License-Identifier: Apache-2.0 |
| 5 | + |
| 6 | +import subprocess |
| 7 | +import time |
| 8 | +import sys |
| 9 | +import argparse |
| 10 | +import os |
| 11 | + |
| 12 | +import benchpark.paths |
| 13 | + |
| 14 | +sys.path.append(str(benchpark.paths.benchpark_home) + "/spack/lib/spack") |
| 15 | +from lib.benchpark.accounting import benchpark_experiments # noqa: E402 |
| 16 | + |
# System spec paired with every modifier experiment.
DEFAULT_SYSTEM = "llnl-cluster cluster=dane"

# Experiment/system pairs whose system does not have enough cores per node.
_SKIP_NOT_ENOUGH_CORES = [
    "gromacs+openmp aws-pcluster instance_type=c6g.xlarge",
    "gromacs+openmp aws-pcluster instance_type=c4.xlarge",
    "gromacs+openmp generic-x86",
    "stream aws-pcluster instance_type=c6g.xlarge",
    "stream aws-pcluster instance_type=c4.xlarge",
    "stream cscs-daint",
    "stream generic-x86",
]

# Broken URLs in application.py would cause the dry run to fail.
_SKIP_BROKEN_URLS = [
    "genesis",
]

# Experiments to skip instead of dry-running.
SKIP_EXPR = _SKIP_NOT_ENOUGH_CORES + _SKIP_BROKEN_URLS
| 31 | + |
| 32 | + |
def run_subprocess_cmd(cmd_list, decode=False):
    """Run a command, capturing its output, and fail loudly on a non-zero exit.

    Args:
        cmd_list: The command and its arguments as a sequence.
        decode: When True, return the captured stdout decoded as UTF-8 text
            instead of the ``subprocess.CompletedProcess`` object.

    Returns:
        The decoded stdout string when ``decode`` is True, otherwise the
        ``CompletedProcess`` (stdout and stderr are captured as bytes).

    Raises:
        RuntimeError: If the command exits with a non-zero status. The
            message contains the command line plus its decoded stdout and
            stderr; the original ``CalledProcessError`` is chained as the
            cause.
    """
    try:
        result = subprocess.run(cmd_list, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        # Decode the captured bytes so the message shows real text (with real
        # newlines) instead of a b'...' repr; never fail while reporting.
        stdout = (e.stdout or b"").decode("utf-8", errors="replace")
        stderr = (e.stderr or b"").decode("utf-8", errors="replace")
        command = " ".join(str(part) for part in cmd_list)
        raise RuntimeError(
            f"Command: {command}\nOutput: {stdout}\nError: {stderr}"
        ) from e
    return result.stdout.decode("utf-8") if decode else result
| 41 | + |
| 42 | + |
def _listing_entries(text, padding):
    """Return the non-empty lines of CLI listing output with padding and tabs removed."""
    cleaned = text.replace(padding, "").replace("\t", "")
    return [line for line in cleaned.split("\n") if line != ""]


def _is_skipped(espec, sspec):
    """Return True if SKIP_EXPR lists this pair, this experiment, or its application."""
    # A SKIP_EXPR entry may be a full "experiment system" pair, an experiment
    # spec, or a bare application name (which skips every experiment of that
    # application on every system).
    app_name = espec.split("+")[0].split(" ")[0]
    return (
        f"{espec} {sspec}" in SKIP_EXPR
        or espec in SKIP_EXPR
        or app_name in SKIP_EXPR
    )


def main():
    """Dry-run every experiment/system combination and report the results.

    Experiments, systems and modifiers are discovered via ``./bin/benchpark
    list ...`` and grouped into test categories (mpi, cuda, rocm, openmp,
    strong, weak, throughput, modifiers). Each selected experiment/system
    pair is passed to ``.github/utils/dryrun.sh`` unless it is covered by
    ``SKIP_EXPR``. The stderr of each failing pair is printed at the end,
    followed by the elapsed time and pass/fail/skip counts.

    Command-line options:
        --test:   only run the tests of one category.
        --dryrun: print what would be run without invoking ``dryrun.sh``.

    Exits with status 1 if any test failed, otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--test",
        choices=[
            "mpi",
            "cuda",
            "rocm",
            "openmp",
            "strong",
            "weak",
            "throughput",
            "modifiers",
        ],
        help="Only run tests of this type",
    )
    parser.add_argument(
        "--dryrun", action="store_true", help="Dry runs this script for testing."
    )
    args = parser.parse_args()

    expr_str = run_subprocess_cmd(
        ["./bin/benchpark", "list", "experiments", "--no-title"], decode=True
    )
    # All spaces are dropped here; the separator before "scaling=" is
    # re-inserted in the loop below.
    experiments = _listing_entries(expr_str, " ")

    mpi_only_expr = set()
    # Checked in insertion order; an experiment goes to the first category
    # whose keyword occurs in its spec.
    categorized = {
        "cuda": [],
        "rocm": [],
        "openmp": [],
        "strong": [],
        "weak": [],
        "throughput": [],
    }

    for e in experiments:
        if "scaling" in e:
            e = e.replace("scaling=", " scaling=")
        if "+strong" in e or "+weak" in e or "+throughput" in e:
            e += "~single_node"
        elif "+" not in e and "=" not in e:
            # No variants at all: a plain (MPI-only) experiment.
            mpi_only_expr.add(e)

        for keyword, bucket in categorized.items():
            if keyword in e:
                bucket.append(e)
                break

    # Sort so tests run in the same order on every invocation (set iteration
    # order for strings is not stable between interpreter runs).
    mpi_only_sorted = sorted(mpi_only_expr)

    str_dict = {}
    for pmodel in ["mpi", "cuda", "rocm", "openmp"]:
        cmd = ["./bin/benchpark", "list", "systems", "--no-title"]
        if pmodel != "mpi":
            cmd += ["-p", pmodel]
        str_dict[pmodel] = _listing_entries(
            run_subprocess_cmd(cmd, decode=True), " " * 4
        )

    mods_str = run_subprocess_cmd(
        ["./bin/benchpark", "list", "modifiers", "--no-title"], decode=True
    )
    # "caliper" is handled separately below via caliper=time.
    nmods = [
        m
        for m in _listing_entries(mods_str, " " * 4)
        if m not in ["allocation", "caliper"]
    ]

    caliper_exp = [
        e.replace("+caliper", " caliper=time")
        for e in benchpark_experiments(exclude_variants=[])
        if "+caliper" in e and e.split("+")[0] in mpi_only_expr
    ]
    modifiers_expr = caliper_exp + [
        e + " " + m + "=on" for e in mpi_only_sorted for m in nmods
    ]

    exprs_to_sys = [
        ("mpi", mpi_only_sorted, str_dict["mpi"]),
        ("cuda", categorized["cuda"], str_dict["cuda"]),
        ("rocm", categorized["rocm"], str_dict["rocm"]),
        ("openmp", categorized["openmp"], str_dict["openmp"]),
        ("strong", categorized["strong"], str_dict["mpi"]),
        ("weak", categorized["weak"], str_dict["mpi"]),
        ("throughput", categorized["throughput"], str_dict["mpi"]),
        ("modifiers", modifiers_expr, [DEFAULT_SYSTEM]),
    ]

    if args.test:
        exprs_to_sys = [tup for tup in exprs_to_sys if tup[0] == args.test]

    # Counts every selected pair, including those that end up skipped.
    total_tests = sum(
        len(expr_spec_list) * len(sys_spec_list)
        for _, expr_spec_list, sys_spec_list in exprs_to_sys
    )
    print(f"Total tests to run: {total_tests}")

    start = time.time()
    errors = {}
    fail_tests = 0
    ran_tests = 0
    skip_tests = 0
    for _, expr_spec_list, sys_spec_list in exprs_to_sys:
        for espec in expr_spec_list:
            for sspec in sys_spec_list:
                expr = f"{espec} {sspec}"
                if _is_skipped(espec, sspec):
                    skip_tests += 1
                    print(f'Skipping "{expr}"')
                    continue
                # Counted even under --dryrun, where nothing is executed.
                ran_tests += 1
                print(f'Running "{expr}"')
                if args.dryrun:
                    continue
                try:
                    subprocess.run(
                        ["bash", ".github/utils/dryrun.sh", espec, sspec],
                        env={**os.environ},
                        capture_output=True,
                        check=True,
                    )
                except subprocess.CalledProcessError as e:
                    # Keep going; failures are reported together at the end.
                    errors[expr] = e.stderr.decode(errors="replace")
                    fail_tests += 1
    end = time.time()

    for i, (key, value) in enumerate(errors.items()):
        print("=" * 100)
        print(str(i + 1) + ". " + key)
        print(value)

    print(f"Elapsed: {(end - start) / 60:.2f} minutes")
    print(
        f"{ran_tests - fail_tests} Passing. {fail_tests} Failing. {skip_tests} Skipped."
    )

    sys.exit(1 if fail_tests > 0 else 0)
| 182 | + |
| 183 | + |
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
0 commit comments