From ed372eeec01de5eff58f29059d7a115ed119d1fe Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Thu, 11 Sep 2025 11:47:34 -0700 Subject: [PATCH 1/9] Enable undersubscription study --- experiments/raja-perf/experiment.py | 4 +- lib/benchpark/cmd/analyze.py | 103 ++++++++++++++++++++-------- modifiers/allocation/modifier.py | 2 + repo/raja-perf/application.py | 2 +- 4 files changed, 79 insertions(+), 32 deletions(-) diff --git a/experiments/raja-perf/experiment.py b/experiments/raja-perf/experiment.py index eb822d6b1..81340e903 100644 --- a/experiments/raja-perf/experiment.py +++ b/experiments/raja-perf/experiment.py @@ -40,7 +40,9 @@ class RajaPerf( def compute_applications_section(self): if self.spec.satisfies("exec_mode=test"): # Per-process size - self.add_experiment_variable("process_problem_size", 1048576, True) + #self.add_experiment_variable("process_problem_size", 134217728, True) #dane + #self.add_experiment_variable("process_problem_size", 67108864, True) #lassen + self.add_experiment_variable("process_problem_size", 268435456, True) # tuo # Number of processes self.add_experiment_variable("n_resources", 1, False) diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py index e2c5e3b9b..1098fd706 100644 --- a/lib/benchpark/cmd/analyze.py +++ b/lib/benchpark/cmd/analyze.py @@ -18,6 +18,7 @@ import matplotlib as mpl import pandas as pd import thicket as th +import seaborn # ----------------------------- # Constants @@ -212,42 +213,62 @@ def make_stacked_line_chart(**kwargs): if kwargs.get("chart_fontsize"): mpl.rcParams.update({"font.size": kwargs.get("chart_fontsize")}) + mapping = { + "lassen": f"ats-2 {kwargs.get('cluster_to_ps')['lassen']}", + "dane": f"cts-2 {kwargs.get('cluster_to_ps')['dane']}", + "tuolumne": f"ats-4 {kwargs.get('cluster_to_ps')['tuolumne']}", + } + + tcol = tdf.columns[0] + tdf["cluster"] = tdf.index.map(lambda x: x[-1]).map(mapping) + tdf["profile"] = tdf.index.map(lambda x: str(x[:-1])) + tdf = tdf.reset_index(drop=True) + xlabel = kwargs.get("chart_xlabel") if isinstance(xlabel, list): xlabel = ", ".join(NAME_REMAP[x] for x in xlabel) else: if xlabel in NAME_REMAP: xlabel = NAME_REMAP[xlabel] - fig, ax = plt.subplots() - tdf.plot( - kind="area", - title=kwargs.get("chart_title", ""), - xlabel=xlabel, - ylabel=y_label, - figsize=kwargs["chart_figsize"] if kwargs["chart_figsize"] else (12, 7), + fig, ax = plt.subplots(figsize=kwargs.get("chart_figsize", (12, 7))) + kind = kwargs.get("chart_kind", "line") + ax.set_title(kwargs.get("chart_title", "")) + ax.set_xlabel(xlabel) + ax.set_ylabel(y_label) + ax.legend(title="System") + plot_args = dict( + data=tdf, + # kind=kind, ax=ax, + hue=("cluster", ""), + x=("profile", ""), + y=tcol, ) + # Add marker only if line plot + if kind == "line": + plot_args["marker"] = "o" + seaborn.lineplot(**plot_args) y_axis_limits = kwargs.get("chart_yaxis_limits") if y_axis_limits is not None: ax.set_ylim(y_axis_limits[0], y_axis_limits[1]) - handles, labels = ax.get_legend_handles_labels() - handles = list(reversed(handles)) - labels = list(reversed(labels)) - calls_list = list(reversed(calls_list)) - for i, label in enumerate(labels): - obj = calls_list[i][0] - name = obj if isinstance(obj, str) else obj[0].frame["name"] - if name not in label: - raise ValueError(f"Name '{name}' is not in label '{label}'") - labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")" - ax.legend( - handles, - labels, - bbox_to_anchor=(1, 0.5), - loc="center left", - title="Region (Calls/rank (max))", - ) + # handles, labels = ax.get_legend_handles_labels() + # handles = list(reversed(handles)) + # labels = list(reversed(labels)) + # calls_list = list(reversed(calls_list)) + # for i, label in enumerate(labels): + # obj = calls_list[i][0] + # name = obj if isinstance(obj, str) else obj[0].frame["name"] + # if name not in label: + # raise ValueError(f"Name '{name}' is not in label '{label}'") + # labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")" + # ax.legend( + # handles, + # labels, + # bbox_to_anchor=(1, 0.5), + # loc="center left", + # title="Region (Calls/rank (max))", + # ) fig.autofmt_xdate() plt.tight_layout() @@ -281,6 +302,8 @@ def prepare_data(**kwargs): ) tk.update_inclusive_columns() + cluster_to_ps = dict(zip(tk.metadata["cluster"], tk.metadata["total_problem_size"])) + clean_tree = tk.tree(kwargs["tree_metric"], render_header=True) clean_tree = re.compile(r"\x1b\[([0-9;]*m)").sub("", clean_tree) @@ -300,6 +323,15 @@ def prepare_data(**kwargs): # Remove singular roots if inclusive metric metric = kwargs["yaxis_metric"] + + tk.dataframe["Bandwidth (GB/s)"] = ( + tk.dataframe["Bytes/Rep"] + / tk.dataframe["Avg time/rank (exc)"] + / 10**9 + * tk.dataframe["Reps"] + * tk.metadata["mpi.world.size"] + ) + if metric in tk.inc_metrics and len(tk.graph.roots) == 1: root_name = tk.graph.roots[0].frame["name"] logger.info( @@ -366,7 +398,7 @@ def prepare_data(**kwargs): tk.dataframe = pd.concat([tk.dataframe.filter(like=p, axis=0) for p in prefix]) # Group by varied parameters - grouped = tk.groupby(x_axis_metadata) + grouped = tk.groupby(x_axis_metadata + ["cluster"]) ctk = th.Thicket.concat_thickets( list(grouped.values()), headers=list(grouped.keys()), axis="columns" ) @@ -374,7 +406,8 @@ def prepare_data(**kwargs): cluster_col = "cluster" if "cluster" in tk.metadata.columns else "host.cluster" # Check these values are constant app = validate_single_metadata_value("application_name", tk) - cluster = validate_single_metadata_value(cluster_col, tk) + # cluster = validate_single_metadata_value(cluster_col, tk) + cluster = "multiple" version = validate_single_metadata_value("version", tk) # Find programming model from spec @@ -389,12 +422,14 @@ def prepare_data(**kwargs): "weak": ["process_problem_size"], "throughput": ["n_resources", "n_nodes"], }[scaling] - constant_str = ", ".join( - f"{int(tk.metadata[key].iloc[0]):,} {NAME_REMAP[key]}" for key in constant_keys - ) + # constant_str = ", ".join( + # f"{int(tk.metadata[key].iloc[0]):,} {NAME_REMAP[key]}" for key in constant_keys + # ) + constant_str = "" # Check constant for key in constant_keys: - validate_single_metadata_value(key, tk) + # validate_single_metadata_value(key, tk) + pass if not kwargs.get("chart_title"): kwargs["chart_title"] = ( @@ -447,6 +482,7 @@ def prepare_data(**kwargs): raise ValueError( f"Expected one scaling factor, found: {list(scaling_factors)}" ) + kwargs["cluster_to_ps"] = cluster_to_ps make_stacked_line_chart(df=ctk.dataframe, x_axis=list(grouped.keys()), **kwargs) @@ -587,6 +623,13 @@ def setup_parser(root_parser): default=None, help="With 'archive', path for the .tar.gz (defaults to CWD/-.tar.gz)", ) + root_parser.add_argument( + "--chart-kind", + type=str, + default="area", + choices=["area", "line", "bar", "scatter"], + help="Type of chart to generate", + ) def command(args): diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py index 7f0621f34..0fe1ed761 100644 --- a/modifiers/allocation/modifier.py +++ b/modifiers/allocation/modifier.py @@ -442,6 +442,8 @@ def lsf_instructions(self, v): batch_opts.append(f"-q {v.queue}") if v.timeout: batch_opts.append(f"-W {TimeFormat.as_hhmm(v.timeout)}") + if v.bank: + batch_opts.append(f"-P {v.bank}") batch_directives = list(f"#BSUB {x}" for x in batch_opts) diff --git a/repo/raja-perf/application.py b/repo/raja-perf/application.py index 82ff30165..c162da3ca 100644 --- a/repo/raja-perf/application.py +++ b/repo/raja-perf/application.py @@ -18,7 +18,7 @@ class RajaPerf(ExecutableApplication): 'mpi','network-point-to-point','network-latency-bound', 'c++','raja','sycl','builtin-caliper'] - executable('run', 'raja-perf.exe --size {process_problem_size} -atsc ${CALI_CONFIG_MODE} -atcc ${OTHER_CALI_CONFIG}', use_mpi=True) + executable('run', 'raja-perf.exe --size {process_problem_size} -atsc ${CALI_CONFIG_MODE} -atcc ${OTHER_CALI_CONFIG} -v Base_Seq -t default -ek Apps_FEMSWEEP', use_mpi=True) workload('suite', executables=['run']) From 5af7d2f19e994afbf95f493e08a6c18aca9b89d9 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Thu, 11 Sep 2025 14:39:06 -0700 Subject: [PATCH 2/9] rzgenie --- experiments/raja-perf/experiment.py | 3 ++- .../hardware_description.yaml | 26 +++++++++++++++++++ systems/llnl-cluster/system.py | 9 ++++++- 3 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 systems/all_hardware_descriptions/Penguin-haswell-OmniPath/hardware_description.yaml diff --git a/experiments/raja-perf/experiment.py b/experiments/raja-perf/experiment.py index 81340e903..3403bcfd0 100644 --- a/experiments/raja-perf/experiment.py +++ b/experiments/raja-perf/experiment.py @@ -42,7 +42,8 @@ def compute_applications_section(self): # Per-process size #self.add_experiment_variable("process_problem_size", 134217728, True) #dane #self.add_experiment_variable("process_problem_size", 67108864, True) #lassen - self.add_experiment_variable("process_problem_size", 268435456, True) # tuo + #self.add_experiment_variable("process_problem_size", 268435456, True) # tuo + self.add_experiment_variable("process_problem_size", 32*1024*1024, True) # rzgenie # Number of processes self.add_experiment_variable("n_resources", 1, False) diff --git a/systems/all_hardware_descriptions/Penguin-haswell-OmniPath/hardware_description.yaml b/systems/all_hardware_descriptions/Penguin-haswell-OmniPath/hardware_description.yaml new file mode 100644 index 000000000..75ef585e9 --- /dev/null +++ b/systems/all_hardware_descriptions/Penguin-haswell-OmniPath/hardware_description.yaml @@ -0,0 +1,26 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 +system_definition: + name: Penguin-haswell-OmniPath + integrator: + vendor: Penguin + name: + processor: + vendor: Intel + name: Xeon-E5-2695v4 + ISA: x86_64 + uArch: haswell + interconnect: + vendor: CornelisNetworks + name: OmniPath + systems-tested: + llnl-rzgenie: + os: TOSS + scheduler: slurm + compiler: gcc + runtime: + mpi: mvapich + installation-year: 2017 + top500-system-instances: diff --git a/systems/llnl-cluster/system.py b/systems/llnl-cluster/system.py index 3f85ad6da..ea3aea75f 100644 --- a/systems/llnl-cluster/system.py +++ b/systems/llnl-cluster/system.py @@ -51,12 +51,19 @@ class LlnlCluster(System): + "/DELL-sapphirerapids-OmniPath/hardware_description.yaml", "queues": [JobQueue("pdebug", 60, 20), JobQueue("pbatch", 1440, 520)], }, + "rzgenie": { + "sys_cores_per_node": 36, + "system_site": "llnl", + "hardware_key": str(hardware_descriptions) + + "/Penguin-haswell-OmniPath/hardware_description.yaml", + "queues": [JobQueue("pdebug", 720, 43)], + } } variant( "cluster", default="dane", - values=("ruby", "magma", "dane"), + values=("ruby", "magma", "dane", "rzgenie"), description="Which cluster to run on", ) From ec3b8852484b7b3a2a0e924211014f429b21b270 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Thu, 11 Sep 2025 15:38:20 -0700 Subject: [PATCH 3/9] Add poodle --- experiments/raja-perf/experiment.py | 3 ++- lib/benchpark/cmd/analyze.py | 7 +++++++ repo/raja-perf/application.py | 2 +- systems/llnl-cluster/system.py | 13 ++++++++++--- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/experiments/raja-perf/experiment.py b/experiments/raja-perf/experiment.py index 3403bcfd0..d2fa8df6e 100644 --- a/experiments/raja-perf/experiment.py +++ b/experiments/raja-perf/experiment.py @@ -43,7 +43,8 @@ def compute_applications_section(self): #self.add_experiment_variable("process_problem_size", 134217728, True) #dane #self.add_experiment_variable("process_problem_size", 67108864, True) #lassen #self.add_experiment_variable("process_problem_size", 268435456, True) # tuo - self.add_experiment_variable("process_problem_size", 32*1024*1024, True) # rzgenie + #self.add_experiment_variable("process_problem_size", 32*1024*1024, True) # rzgenie + self.add_experiment_variable("process_problem_size", 128*1024*1024, True) # poodle # Number of processes self.add_experiment_variable("n_resources", 1, False) diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py index 1098fd706..289f88ac2 100644 --- a/lib/benchpark/cmd/analyze.py +++ b/lib/benchpark/cmd/analyze.py @@ -217,6 +217,7 @@ def make_stacked_line_chart(**kwargs): "lassen": f"ats-2 {kwargs.get('cluster_to_ps')['lassen']}", "dane": f"cts-2 {kwargs.get('cluster_to_ps')['dane']}", "tuolumne": f"ats-4 {kwargs.get('cluster_to_ps')['tuolumne']}", + "rzgenie": f"cts-1 {kwargs.get('cluster_to_ps')['rzgenie']}", } tcol = tdf.columns[0] @@ -236,6 +237,8 @@ def make_stacked_line_chart(**kwargs): ax.set_xlabel(xlabel) ax.set_ylabel(y_label) ax.legend(title="System") + # plt.xscale("log", base=2) + plt.yscale("log", base=2) plot_args = dict( data=tdf, # kind=kind, @@ -252,6 +255,10 @@ def make_stacked_line_chart(**kwargs): if y_axis_limits is not None: ax.set_ylim(y_axis_limits[0], y_axis_limits[1]) + from matplotlib.ticker import ScalarFormatter + plt.gca().yaxis.set_major_formatter(ScalarFormatter()) + plt.gca().ticklabel_format(style="plain", axis="y") + # handles, labels = ax.get_legend_handles_labels() # handles = list(reversed(handles)) # labels = list(reversed(labels)) diff --git a/repo/raja-perf/application.py b/repo/raja-perf/application.py index c162da3ca..4ad9b8435 100644 --- a/repo/raja-perf/application.py +++ b/repo/raja-perf/application.py @@ -18,7 +18,7 @@ class RajaPerf(ExecutableApplication): 'mpi','network-point-to-point','network-latency-bound', 'c++','raja','sycl','builtin-caliper'] - executable('run', 'raja-perf.exe --size {process_problem_size} -atsc ${CALI_CONFIG_MODE} -atcc ${OTHER_CALI_CONFIG} -v Base_Seq -t default -ek Apps_FEMSWEEP', use_mpi=True) + executable('run', 'raja-perf.exe --size {process_problem_size} -atsc ${CALI_CONFIG_MODE} -atcc ${OTHER_CALI_CONFIG} -v Base_Seq -t default -k Stream_TRIAD -ek Apps_FEMSWEEP', use_mpi=True) workload('suite', executables=['run']) diff --git a/systems/llnl-cluster/system.py b/systems/llnl-cluster/system.py index ea3aea75f..2323cde96 100644 --- a/systems/llnl-cluster/system.py +++ b/systems/llnl-cluster/system.py @@ -57,13 +57,20 @@ class LlnlCluster(System): "hardware_key": str(hardware_descriptions) + "/Penguin-haswell-OmniPath/hardware_description.yaml", "queues": [JobQueue("pdebug", 720, 43)], - } + }, + "poodle": { + "sys_cores_per_node": 112, + "system_site": "llnl", + "hardware_key": str(hardware_descriptions) + + "/DELL-sapphirerapids-OmniPath/hardware_description.yaml", + "queues": [JobQueue("pdebug", 30, 3), JobQueue("pbatch", 12000, 29), JobQueue("phighmem", 12000, 4)], + }, } variant( "cluster", default="dane", - values=("ruby", "magma", "dane", "rzgenie"), + values=("ruby", "magma", "dane", "rzgenie", "poodle"), description="Which cluster to run on", ) @@ -85,7 +92,7 @@ class LlnlCluster(System): variant( "queue", default="none", - values=("none", "pbatch", "pdebug"), + values=("none", "pbatch", "pdebug", "phighmem"), multi=False, description="Submit to queue other than the default queue (e.g. pdebug)", ) From 9ae9c80bb58774442126b1c43327163058b42995 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 12 Sep 2025 09:07:11 -0700 Subject: [PATCH 4/9] Add poodle plot --- lib/benchpark/cmd/analyze.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py index 289f88ac2..1fb70bc33 100644 --- a/lib/benchpark/cmd/analyze.py +++ b/lib/benchpark/cmd/analyze.py @@ -218,11 +218,12 @@ def make_stacked_line_chart(**kwargs): "dane": f"cts-2 {kwargs.get('cluster_to_ps')['dane']}", "tuolumne": f"ats-4 {kwargs.get('cluster_to_ps')['tuolumne']}", "rzgenie": f"cts-1 {kwargs.get('cluster_to_ps')['rzgenie']}", + "poodle": f"ats-3 {kwargs.get('cluster_to_ps')['poodle']}" } tcol = tdf.columns[0] tdf["cluster"] = tdf.index.map(lambda x: x[-1]).map(mapping) - tdf["profile"] = tdf.index.map(lambda x: str(x[:-1])) + tdf["profile"] = tdf.index.map(lambda x: int(x[1])) tdf = tdf.reset_index(drop=True) xlabel = kwargs.get("chart_xlabel") @@ -237,7 +238,7 @@ def make_stacked_line_chart(**kwargs): ax.set_xlabel(xlabel) ax.set_ylabel(y_label) ax.legend(title="System") - # plt.xscale("log", base=2) + plt.xscale("log", base=2) plt.yscale("log", base=2) plot_args = dict( data=tdf, @@ -254,10 +255,12 @@ def make_stacked_line_chart(**kwargs): y_axis_limits = kwargs.get("chart_yaxis_limits") if y_axis_limits is not None: ax.set_ylim(y_axis_limits[0], y_axis_limits[1]) + ax.set_xlim(1, 128) from matplotlib.ticker import ScalarFormatter plt.gca().yaxis.set_major_formatter(ScalarFormatter()) - plt.gca().ticklabel_format(style="plain", axis="y") + plt.gca().xaxis.set_major_formatter(ScalarFormatter()) + plt.gca().ticklabel_format(style="plain") # handles, labels = ax.get_legend_handles_labels() # handles = list(reversed(handles)) From 95eebab5392bb06b5234a7d30d014ac0f34e4d4f Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 12 Sep 2025 12:49:36 -0700 Subject: [PATCH 5/9] Grid --- lib/benchpark/cmd/analyze.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py index 1fb70bc33..49f011332 100644 --- a/lib/benchpark/cmd/analyze.py +++ b/lib/benchpark/cmd/analyze.py @@ -239,7 +239,8 @@ def make_stacked_line_chart(**kwargs): ax.set_ylabel(y_label) ax.legend(title="System") plt.xscale("log", base=2) - plt.yscale("log", base=2) + plt.yscale("log", base=2) + plt.grid(True) plot_args = dict( data=tdf, # kind=kind, From 24b70ed9896bdde9a2a9ee947362d48b54db4d7a Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 12 Sep 2025 13:06:24 -0700 Subject: [PATCH 6/9] Latest command and fig --- lib/benchpark/cmd/analyze.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py index 49f011332..b2de138b8 100644 --- a/lib/benchpark/cmd/analyze.py +++ b/lib/benchpark/cmd/analyze.py @@ -11,6 +11,7 @@ import tarfile import shutil import warnings +from tqdm import tqdm from glob import glob from datetime import datetime @@ -237,7 +238,6 @@ def make_stacked_line_chart(**kwargs): ax.set_title(kwargs.get("chart_title", "")) ax.set_xlabel(xlabel) ax.set_ylabel(y_label) - ax.legend(title="System") plt.xscale("log", base=2) plt.yscale("log", base=2) plt.grid(True) @@ -263,6 +263,8 @@ def make_stacked_line_chart(**kwargs): plt.gca().xaxis.set_major_formatter(ScalarFormatter()) plt.gca().ticklabel_format(style="plain") + plt.legend(title="System") + # handles, labels = ax.get_legend_handles_labels() # handles = list(reversed(handles)) # labels = list(reversed(labels)) @@ -311,7 +313,11 @@ def prepare_data(**kwargs): tk = th.Thicket.from_caliperreader( files, intersection=intersection, disable_tqdm=True ) - tk.update_inclusive_columns() + if kwargs["yaxis_metric"] in tk.inc_metrics and not kwargs["no_update_inc_cols"]: + pbar = tqdm(total=1, desc="Updating inclusive columns") + tk.update_inclusive_columns() + pbar.update(1) + pbar.close() cluster_to_ps = dict(zip(tk.metadata["cluster"], tk.metadata["total_problem_size"])) @@ -641,6 +647,11 @@ def setup_parser(root_parser): choices=["area", "line", "bar", "scatter"], help="Type of chart to generate", ) + root_parser.add_argument( + "--no-update-inc-cols", + action="store_true", + help="Don't call Thicket.update_inclusive_columns() which can take a while." + ) def command(args): From 2fc1a96258ef92c9c9abfaacf16a2566c14170a0 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 12 Sep 2025 15:04:45 -0700 Subject: [PATCH 7/9] Major refactor to get both multi cluster visualizations working and keep stacked area chart for single cluster --- lib/benchpark/cmd/analyze.py | 219 +++++++++++++++++++---------------- 1 file changed, 118 insertions(+), 101 deletions(-) diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py index b2de138b8..6e7c7ae1b 100644 --- a/lib/benchpark/cmd/analyze.py +++ b/lib/benchpark/cmd/analyze.py @@ -112,10 +112,6 @@ def _validate_workspace_dir(workspace_dir): raise ValueError( f"Workspace dir '{workspace_dir}' does not exist or is not a directory" ) - if ".ramble-workspace" not in os.listdir(workspace_dir): - raise ValueError( - f"Directory '{workspace_dir}' must be a valid ramble workspace (missing .ramble-workspace)" - ) return os.path.abspath(workspace_dir) @@ -171,9 +167,9 @@ def analyze_archive(analyze_dir, cali_files, output=None): # ----------------------------- # Chart Generation # ----------------------------- -def make_stacked_line_chart(**kwargs): +def make_chart(**kwargs): """ - Generates a stacked area line chart based on Thicket DataFrame. + Generates a chart based on Thicket DataFrame. Args: df (pd.DataFrame): DataFrame to plot. @@ -200,32 +196,24 @@ def make_stacked_line_chart(**kwargs): os.makedirs(kwargs["out_dir"], exist_ok=True) - tdf_calls = df[[(i, "Calls/rank (max)") for i in x_axis]].T.reset_index( - level=1, drop=True - ) - calls_list = [] - for column in tdf_calls.columns: - mx = max(tdf_calls[column]) - val = int(mx) if mx > 0 else 0 - calls_list.append((column, val)) + # tdf_calls = df["Calls/rank (max)"].T.reset_index( + # level=1, drop=True + # ) + # calls_list = [] + # for column in tdf_calls.columns: + # mx = max(tdf_calls[column]) + # val = int(mx) if mx > 0 else 0 + # calls_list.append((column, val)) - tdf = df[[(i, value) for i in x_axis]].T.reset_index(level=1, drop=True) + # tdf = df[[(i, value) for i in x_axis]].T.reset_index(level=1, drop=True) mpl.rcParams["axes.prop_cycle"] = mpl.cycler(color=COLOR_PALETTE) if kwargs.get("chart_fontsize"): mpl.rcParams.update({"font.size": kwargs.get("chart_fontsize")}) - mapping = { - "lassen": f"ats-2 {kwargs.get('cluster_to_ps')['lassen']}", - "dane": f"cts-2 {kwargs.get('cluster_to_ps')['dane']}", - "tuolumne": f"ats-4 {kwargs.get('cluster_to_ps')['tuolumne']}", - "rzgenie": f"cts-1 {kwargs.get('cluster_to_ps')['rzgenie']}", - "poodle": f"ats-3 {kwargs.get('cluster_to_ps')['poodle']}" - } - - tcol = tdf.columns[0] - tdf["cluster"] = tdf.index.map(lambda x: x[-1]).map(mapping) - tdf["profile"] = tdf.index.map(lambda x: int(x[1])) - tdf = tdf.reset_index(drop=True) + # tcol = tdf.columns[0] + # tdf["cluster"] = tdf.index.map(lambda x: x[-1]).map(mapping) + # tdf["profile"] = tdf.index.map(lambda x: ", ".join(str(i) for i in x[:-1])) + # tdf = tdf.reset_index(drop=True) xlabel = kwargs.get("chart_xlabel") if isinstance(xlabel, list): @@ -238,36 +226,50 @@ def make_stacked_line_chart(**kwargs): ax.set_title(kwargs.get("chart_title", "")) ax.set_xlabel(xlabel) ax.set_ylabel(y_label) - plt.xscale("log", base=2) - plt.yscale("log", base=2) + # plt.yscale("log", base=2) plt.grid(True) + df = df.sort_values(by=x_axis) plot_args = dict( - data=tdf, - # kind=kind, ax=ax, - hue=("cluster", ""), - x=("profile", ""), - y=tcol, ) + if kind == "area": + plot_args["kind"] = "area" + df["xaxis"] = df.apply(lambda row: tuple(row[col] for col in x_axis), axis=1) + else: + plot_args["data"] = df + plot_args["x"] = "xaxis" + plot_args["y"] = yaxis_metric + df["xaxis"] = df.apply( + lambda row: ", ".join([str(row[col]) for col in x_axis]), axis=1 + ) + if kwargs["cluster"] == "multiple": + plot_args["hue"] = "cluster" # Add marker only if line plot if kind == "line": plot_args["marker"] = "o" - seaborn.lineplot(**plot_args) + seaborn.lineplot(**plot_args) + elif kind == "area": + tdf = ( + df[[yaxis_metric, "name", "xaxis"]] + .reset_index(drop=True) + .sort_values("xaxis") + ) + tdf = tdf.pivot(index="xaxis", columns="name", values=yaxis_metric) + tdf.plot(**plot_args) + elif kind == "scatter": + seaborn.scatterplot(**plot_args) + elif kind == "bar": + seaborn.barplot(**plot_args) + else: + raise NotImplementedError(f"Uknown plot kind {kind}") + y_axis_limits = kwargs.get("chart_yaxis_limits") if y_axis_limits is not None: ax.set_ylim(y_axis_limits[0], y_axis_limits[1]) - ax.set_xlim(1, 128) - - from matplotlib.ticker import ScalarFormatter - plt.gca().yaxis.set_major_formatter(ScalarFormatter()) - plt.gca().xaxis.set_major_formatter(ScalarFormatter()) - plt.gca().ticklabel_format(style="plain") - plt.legend(title="System") - - # handles, labels = ax.get_legend_handles_labels() - # handles = list(reversed(handles)) - # labels = list(reversed(labels)) + handles, labels = ax.get_legend_handles_labels() + handles = list(reversed(handles)) + labels = list(reversed(labels)) # calls_list = list(reversed(calls_list)) # for i, label in enumerate(labels): # obj = calls_list[i][0] @@ -275,20 +277,21 @@ def make_stacked_line_chart(**kwargs): # if name not in label: # raise ValueError(f"Name '{name}' is not in label '{label}'") # labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")" - # ax.legend( - # handles, - # labels, - # bbox_to_anchor=(1, 0.5), - # loc="center left", - # title="Region (Calls/rank (max))", - # ) + ax.legend( + handles, + labels, + bbox_to_anchor=(1, 0.5), + loc="center left", + title="Region (Calls/rank (max))", + ) + ax.set_xlabel(xlabel) fig.autofmt_xdate() plt.tight_layout() filename = os.path.join(kwargs["out_dir"], kwargs["chart_file_name"]) logger.info(f"Saving figure data points to {filename}.csv") - tdf.to_csv(filename + ".csv") + df.to_csv(filename + ".csv") logger.info(f"Saving figure to {filename}.png") plt.savefig(filename + ".png") logger.info( @@ -319,7 +322,7 @@ def prepare_data(**kwargs): pbar.update(1) pbar.close() - cluster_to_ps = dict(zip(tk.metadata["cluster"], tk.metadata["total_problem_size"])) + # cluster_to_ps = dict(zip(tk.metadata["cluster"], tk.metadata["total_problem_size"])) clean_tree = tk.tree(kwargs["tree_metric"], render_header=True) clean_tree = re.compile(r"\x1b\[([0-9;]*m)").sub("", clean_tree) @@ -415,16 +418,22 @@ def prepare_data(**kwargs): tk.dataframe = pd.concat([tk.dataframe.filter(like=p, axis=0) for p in prefix]) # Group by varied parameters - grouped = tk.groupby(x_axis_metadata + ["cluster"]) - ctk = th.Thicket.concat_thickets( - list(grouped.values()), headers=list(grouped.keys()), axis="columns" - ) + # grouped = tk.groupby(x_axis_metadata) + # print(grouped.keys()) + # ctk = th.Thicket.concat_thickets( + # list(grouped.values()), headers=list(grouped.keys()), axis="index" + # ) + + tk.metadata_columns_to_perfdata(["cluster"] + list(NAME_REMAP.keys())) cluster_col = "cluster" if "cluster" in tk.metadata.columns else "host.cluster" # Check these values are constant app = validate_single_metadata_value("application_name", tk) - # cluster = validate_single_metadata_value(cluster_col, tk) - cluster = "multiple" + try: + cluster = validate_single_metadata_value(cluster_col, tk) + except ValueError: + print("Multiple clusters detected. Using multi-cluster mode.") + cluster = "multiple" version = validate_single_metadata_value("version", tk) # Find programming model from spec @@ -439,14 +448,18 @@ def prepare_data(**kwargs): "weak": ["process_problem_size"], "throughput": ["n_resources", "n_nodes"], }[scaling] - # constant_str = ", ".join( - # f"{int(tk.metadata[key].iloc[0]):,} {NAME_REMAP[key]}" for key in constant_keys - # ) - constant_str = "" + constant_str = ( + ", ".join( + f"{int(tk.metadata[key].iloc[0]):,} {NAME_REMAP[key]}" + for key in constant_keys + ) + if cluster != "multiple" + else "" + ) # Check constant - for key in constant_keys: - # validate_single_metadata_value(key, tk) - pass + if cluster != "multiple": + for key in constant_keys: + validate_single_metadata_value(key, tk) if not kwargs.get("chart_title"): kwargs["chart_title"] = ( @@ -466,27 +479,27 @@ def prepare_data(**kwargs): f.write(clean_tree) logger.info(f"Saving Input Calltree to {tree_file}") - for key in grouped.keys(): - ctk.dataframe[(key, "perc")] = ( - ctk.dataframe[(key, metric)] / ctk.dataframe[(key, metric)].sum() - ) * 100 - - top_n = kwargs.get("top_n_regions", -1) - if top_n != -1: - temp_df_idx = ctk.dataframe.nlargest( - top_n, [(list(grouped.keys())[0], metric)] - ).index - temp_df = ctk.dataframe[ctk.dataframe.index.isin(temp_df_idx)] - temp_df.loc["Sum(removed_regions)"] = 0 - for p in ctk.profile: - temp_df.loc["Sum(removed_regions)", (p[1], metric)] = ( - ctk.dataframe.loc[:, (p[1], metric)].sum() - - temp_df.loc[:, (p[1], metric)].sum() - ).iloc[0] - ctk.dataframe = temp_df - logger.info( - f"Filtered top {top_n} regions for chart display. Added the sum of the regions that were removed as single region." - ) + # for key in grouped.keys(): + # tk.dataframe["perc"] = tk.dataframe[tk.dataframe[g] == ] + # ctk.dataframe[(key, "perc")] = ( + # ctk.dataframe[(key, metric)] / ctk.dataframe[(key, metric)].sum() + # ) * 100 + + # top_n = kwargs.get("top_n_regions", -1) + # if top_n != -1: + # temp_df_idx = tk.dataframe.nlargest( + # top_n, metric).index + # temp_df = tk.dataframe[tk.dataframe.index.isin(temp_df_idx)] + # temp_df.loc["Sum(removed_regions)"] = 0 + # for p in tk.profile: + # temp_df.loc["Sum(removed_regions)", metric] = ( + # tk.dataframe.loc[:, metric].sum() + # - temp_df.loc[:, metric].sum() + # ) + # tk.dataframe = temp_df + # logger.info( + # f"Filtered top {top_n} regions for chart display. Added the sum of the regions that were removed as single region." + # ) if not kwargs.get("chart_xlabel"): kwargs["chart_xlabel"] = x_axis_metadata @@ -499,9 +512,10 @@ def prepare_data(**kwargs): raise ValueError( f"Expected one scaling factor, found: {list(scaling_factors)}" ) - kwargs["cluster_to_ps"] = cluster_to_ps + # kwargs["cluster_to_ps"] = cluster_to_ps + kwargs["cluster"] = cluster - make_stacked_line_chart(df=ctk.dataframe, x_axis=list(grouped.keys()), **kwargs) + make_chart(df=tk.dataframe, x_axis=x_axis_metadata, **kwargs) def setup_parser(root_parser): @@ -513,7 +527,7 @@ def setup_parser(root_parser): "--workspace-dir", required=True, type=str, - help="Directory of ramble workspace.", + help="Directory Caliper files. Files will be found recursively.", metavar="RAMBLE_WORKSPACE_DIR", ) root_parser.add_argument( @@ -526,7 +540,10 @@ def setup_parser(root_parser): root_parser.add_argument( "--chart-type", default="raw", - choices=["raw", "percentage"], + choices=[ + "raw", + # "percentage" + ], type=str, help="Specify processing on the metric. 'raw' does nothing, 'percentage' shows the metric values as a percentage relative to the total summation of all regions.", ) @@ -560,13 +577,13 @@ def setup_parser(root_parser): help="Query for one or more regions REGION. Includes children of region.", metavar="REGION", ) - root_parser.add_argument( - "--top-n-regions", - default=-1, - type=int, - help="Filters only top N largest metric entries to be included in chart (based on the first profile).", - metavar="N", - ) + # root_parser.add_argument( + # "--top-n-regions", + # default=-1, + # type=int, + # help="Filters only top N largest metric entries to be included in chart (based on the first profile).", + # metavar="N", + # ) root_parser.add_argument( "--group-regions-name", action="store_true", @@ -650,7 +667,7 @@ def setup_parser(root_parser): root_parser.add_argument( "--no-update-inc-cols", action="store_true", - help="Don't call Thicket.update_inclusive_columns() which can take a while." + help="Don't call Thicket.update_inclusive_columns() which can take a while.", ) From 2255d14afb2f1b53eb98e254367e9be07e27c95c Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 12 Sep 2025 15:43:47 -0700 Subject: [PATCH 8/9] Try to formulate problem sizes into experiment --- experiments/raja-perf/experiment.py | 27 +++++++++++++++++++++++++-- systems/llnl-cluster/system.py | 7 +++++++ systems/llnl-elcapitan/system.py | 2 ++ systems/llnl-sierra/system.py | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/experiments/raja-perf/experiment.py b/experiments/raja-perf/experiment.py index d2fa8df6e..f830408aa 100644 --- a/experiments/raja-perf/experiment.py +++ b/experiments/raja-perf/experiment.py @@ -3,6 +3,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import math + from benchpark.directives import variant, maintainers from benchpark.experiment import Experiment from benchpark.mpi import MpiOnlyExperiment @@ -35,16 +37,37 @@ class RajaPerf( description="app version", ) + variant( + "exec_mode", + default="test", + values=("test", "perf", "cache_study"), + description="Execution mode", + ) + maintainers("michaelmckinsey1") def compute_applications_section(self): if self.spec.satisfies("exec_mode=test"): - # Per-process size + self.add_experiment_variable("process_problem_size", 1048576, True) + self.add_experiment_variable("n_resources", 1, False) + elif self.spec.satisfies("exec_mode=cache_study"): + sys = self.system_spec.system + sockets = sys.sys_sockets_per_node + L3size_bytes = sys.sys_cpu_L3_MB * 10**6 + L2size_bytes = sys.sys_cpu_L2_KB * 10**3 + num_cores = sys.sys_cores_per_node / sockets + div_constant = 16 # 2 doubles per problem size + cache_constant = 4 # Make sure data stays in cache + if hasattr(sys, "sys_ccd_per_node"): + L3size_bytes *= sys.sys_ccd_per_node / sockets + problem_size = (sockets * (L3size_bytes + L2size_bytes * num_cores)) * cache_constant / div_constant + nearest_power_of_two = 2 ** round(math.log2(problem_size)) + self.add_experiment_variable("process_problem_size", nearest_power_of_two, True) #self.add_experiment_variable("process_problem_size", 134217728, True) #dane #self.add_experiment_variable("process_problem_size", 67108864, True) #lassen #self.add_experiment_variable("process_problem_size", 268435456, True) # tuo #self.add_experiment_variable("process_problem_size", 32*1024*1024, True) # rzgenie - self.add_experiment_variable("process_problem_size", 128*1024*1024, True) # poodle + #self.add_experiment_variable("process_problem_size", 128*1024*1024, True) # poodle # Number of processes self.add_experiment_variable("n_resources", 1, False) diff --git a/systems/llnl-cluster/system.py b/systems/llnl-cluster/system.py index 2323cde96..ac7cd78d3 100644 --- a/systems/llnl-cluster/system.py +++ b/systems/llnl-cluster/system.py @@ -39,6 +39,7 @@ class LlnlCluster(System): }, "dane": { "sys_cores_per_node": 112, + "sys_sockets_per_node": 2, "sys_cores_os_reserved_per_node": 0, # No explicit core reservation, first thread on each core reserved (2 threads per core) "sys_cores_os_reserved_per_node_list": None, "sys_mem_per_node_GB": 256, @@ -54,12 +55,18 @@ class LlnlCluster(System): "rzgenie": { "sys_cores_per_node": 36, "system_site": "llnl", + "sys_sockets_per_node": 2, + "sys_cpu_L2_KB": 256, + "sys_cpu_L3_MB": 45, "hardware_key": str(hardware_descriptions) + "/Penguin-haswell-OmniPath/hardware_description.yaml", "queues": [JobQueue("pdebug", 720, 43)], }, "poodle": { "sys_cores_per_node": 112, + "sys_sockets_per_node": 2, + "sys_cpu_L2_KB": 2048, + "sys_cpu_L3_MB": 112.5, # Depends on partition (could be 105) "system_site": "llnl", "hardware_key": str(hardware_descriptions) + "/DELL-sapphirerapids-OmniPath/hardware_description.yaml", diff --git a/systems/llnl-elcapitan/system.py b/systems/llnl-elcapitan/system.py index 040f1e032..5a7c29d0c 100644 --- a/systems/llnl-elcapitan/system.py +++ b/systems/llnl-elcapitan/system.py @@ -58,6 +58,8 @@ class LlnlElcapitan(System): ], # 3 cores reserved per socket "sys_gpus_per_node": None, # Determined by "gpumode" variant "sys_sockets_per_node": 4, + "sys_ccd_per_node": 12, + "sys_xcd_per_node": 24, "sys_mem_per_node_GB": 512, "sys_cpu_mem_per_node_MB": 3072, "sys_gpu_mem_per_node_GB": 512, diff --git a/systems/llnl-sierra/system.py b/systems/llnl-sierra/system.py index 0267e9635..5efc972e6 100644 --- a/systems/llnl-sierra/system.py +++ b/systems/llnl-sierra/system.py @@ -19,6 +19,7 @@ class LlnlSierra(System): "lassen": { "cuda_arch": 70, "sys_cores_per_node": 40, + "sys_sockets_per_node": 2, "sys_cores_os_reserved_per_node": 4, "sys_cores_os_reserved_per_node_list": [ 0, From f9639cab3de995a9e0d883ab58a672b50dcfeb0c Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Mon, 15 Sep 2025 12:02:58 -0700 Subject: [PATCH 9/9] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6091dfd14..3adac6c8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,6 @@ explicit_start = false [project.optional-dependencies] analyze = [ - "llnl-thicket[plotting]", + "llnl-thicket[plotting]==2025.1.0", "matplotlib" -] \ No newline at end of file +]