From 1b57e3cdc1a599114fea13a633501a309049655b Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Fri, 4 Apr 2025 21:27:26 -0600 Subject: [PATCH 1/9] EAMxx: add machine file for ghci-snl-hip --- .../cmake/machine-files/ghci-snl-hip.cmake | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 components/eamxx/cmake/machine-files/ghci-snl-hip.cmake diff --git a/components/eamxx/cmake/machine-files/ghci-snl-hip.cmake b/components/eamxx/cmake/machine-files/ghci-snl-hip.cmake new file mode 100644 index 000000000000..7016bf381a79 --- /dev/null +++ b/components/eamxx/cmake/machine-files/ghci-snl-hip.cmake @@ -0,0 +1,17 @@ +# Common settings for our ghci images +include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake) + +# Set SCREAM_MACHINE +set(SCREAM_MACHINE ghci-snl-hip CACHE STRING "") + +# Enable HIP in kokkos +set (EKAT_MACH_FILES_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../../externals/ekat/cmake/machine-files) +include (${EKAT_MACH_FILES_PATH}/kokkos/hip.cmake) + +set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks") + +# TODO: rebuild hip image with GPU-aware MPI, so we can set this to ON +option(SCREAM_MPI_ON_DEVICE "Whether to use device pointers for MPI calls" OFF) + +# Currently, we have 2 GPUs/node on Blake, and we run a SINGLE build per node, so we can fit 2 ranks there +set(SCREAM_TEST_MAX_RANKS 2 CACHE STRING "Upper limit on ranks for mpi tests") From b6954626e484cfe020327776fc7715bb27617e27 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 17 Apr 2025 15:33:14 -0600 Subject: [PATCH 2/9] EAMxx: add ghci-snl-hip entry to machines_specs.py and cacts.yaml --- components/eamxx/cacts.yaml | 8 ++++++++ components/eamxx/scripts/machines_specs.py | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/components/eamxx/cacts.yaml b/components/eamxx/cacts.yaml index 16d42ca4574e..d851fc401ccd 100644 --- a/components/eamxx/cacts.yaml +++ b/components/eamxx/cacts.yaml @@ -140,6 +140,14 @@ 
machines: gpu_arch: "cuda" num_run_res: "$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)" + ghci-snl-hip: + baselines_dir: /projects/e3sm/baselines/scream/ghci-snl-hip + gpu_arch: hip + num_run_res: $(rocm-smi --showproductname | grep 'GPU\[' | awk '{print $1}' | sort -u | wc -l) + env_setup: + - source /etc/profile.d/setup-user-env.sh + - export MPICH_CXX=hipcc + ghci-oci: env_setup: ["eval $(${project.root_dir}/../../cime/CIME/Tools/get_case_env -c SMS.ne4pg2_ne4pg2.F2010-SCREAMv1.ghci-oci_gnu)"] diff --git a/components/eamxx/scripts/machines_specs.py b/components/eamxx/scripts/machines_specs.py index e9d18987e4e9..db0d9a6723e7 100644 --- a/components/eamxx/scripts/machines_specs.py +++ b/components/eamxx/scripts/machines_specs.py @@ -283,6 +283,19 @@ def setup(cls): cls.gpu_arch = "cuda" cls.num_run_res = int(run_cmd_no_fail("nvidia-smi --query-gpu=name --format=csv,noheader | wc -l")) +############################################################################### +class GHCISNLHip(Machine): +############################################################################### + concrete = True + @classmethod + def setup(cls): + super().setup_base(name="ghci-snl-hip") + cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-hip" + cls.gpu_arch = "hip" + cls.num_run_res = int(run_cmd_no_fail("rocm-smi --showproductname | grep 'GPU\[' | awk '{print $1}' | sort -u | wc -l")) + cls.cxx_compiler = "mpicxx" + cls.env_setup = ["source /etc/profile.d/setup-user-env.sh && export MPICH_CXX=hipcc"] + ############################################################################### class GHCIOCI(Machine): ############################################################################### From dd6de66cf209fa2db90408999a4c619a592bb94f Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 17 Apr 2025 15:34:36 -0600 Subject: [PATCH 3/9] Workflows: add gcc-hip job to eamxx-sa --- .github/workflows/eamxx-sa-testing.yml | 39 ++++++++++++++++++++++++++ 1 file changed, 39 
insertions(+) diff --git a/.github/workflows/eamxx-sa-testing.yml b/.github/workflows/eamxx-sa-testing.yml index d136b1b4ef5d..1d060ae06e5b 100644 --- a/.github/workflows/eamxx-sa-testing.yml +++ b/.github/workflows/eamxx-sa-testing.yml @@ -161,3 +161,42 @@ jobs: submit: ${{ env.submit }} cmake-configs: Kokkos_ARCH_HOPPER90=${{ env.Hopper }};Kokkos_ARCH_AMPERE80=${{ env.Ampere }};Kokkos_ARCH_VOLTA70=${{ env.Volta }};CMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }};SCREAM_SMALL_KERNELS=${{ matrix.test.SK }} ekat: ${{ env.ekat }} + gcc-hip: + if: | + ${{ + github.event_name != 'workflow_dispatch' || + ( + github.event.inputs.job_to_run == 'gcc-hip' || + github.event.inputs.job_to_run == 'all' + ) + }} + runs-on: [self-hosted, ghci-snl-hip, hip, gcc] + strategy: + fail-fast: false + matrix: + test: + # - build_type: sp NEED TO CHECK WHY DEBUG BUILDS FAIL + # SK: OFF + # - build_type: dbg + # SK: ON + - build_type: opt + SK: OFF + name: gcc-hip / ${{ matrix.test.build_type }} + steps: + - name: Check out the repository + uses: actions/checkout@v4 + with: + persist-credentials: false + show-progress: false + submodules: recursive + - name: Show action trigger + uses: ./.github/actions/show-workflow-trigger + - name: Run tests + uses: ./.github/actions/test-all-eamxx + with: + build_type: ${{ matrix.test.build_type }} + machine: ghci-snl-cuda + generate: ${{ env.generate }} + submit: ${{ env.submit }} + cmake-configs: Kokkos_ARCH_HOPPER90=${{ env.Hopper }};Kokkos_ARCH_AMPERE80=${{ env.Ampere }};Kokkos_ARCH_VOLTA70=${{ env.Volta }};CMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }};SCREAM_SMALL_KERNELS=${{ matrix.test.SK }} + ekat: ${{ env.ekat }} From dd36125b0ff82f13fbc02c79cb7492258b18c50c Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 1 May 2025 16:16:35 -0600 Subject: [PATCH 4/9] EAMxx: fix warnings on HIP that trigger ctest failure These warnings contain a code snippet which contains the keyword ERROR, which ctest interprets as a build error --- 
components/eamxx/src/control/atmosphere_driver.cpp | 4 ++-- components/eamxx/src/share/atm_process/atmosphere_process.cpp | 2 +- components/eamxx/src/share/io/eamxx_scorpio_interface.cpp | 2 +- components/eamxx/src/share/io/tests/io_basic.cpp | 2 +- components/eamxx/src/share/io/tests/io_filled.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/components/eamxx/src/control/atmosphere_driver.cpp b/components/eamxx/src/control/atmosphere_driver.cpp index 4b2b114e88ef..3a78e9868772 100644 --- a/components/eamxx/src/control/atmosphere_driver.cpp +++ b/components/eamxx/src/control/atmosphere_driver.cpp @@ -479,7 +479,7 @@ void AtmosphereDriver::setup_column_conservation_checks () const std::string fail_handling_type_str = driver_options_pl.get("column_conservation_checks_fail_handling_type", "warning"); - CheckFailHandling fail_handling_type; + CheckFailHandling fail_handling_type = CheckFailHandling::Fatal; if (fail_handling_type_str == "warning") { fail_handling_type = CheckFailHandling::Warning; } else if (fail_handling_type_str == "fatal") { @@ -995,7 +995,7 @@ void AtmosphereDriver::create_logger () { "Invalid string for 'Atm Log File': '" + log_fname + "'.\n"); auto str2lev = [](const std::string& s, const std::string& name) { - LogLevel lev; + LogLevel lev = LogLevel::err; if (s=="trace") { lev = LogLevel::trace; } else if (s=="debug") { diff --git a/components/eamxx/src/share/atm_process/atmosphere_process.cpp b/components/eamxx/src/share/atm_process/atmosphere_process.cpp index cba233b9e5b2..f2540f22affe 100644 --- a/components/eamxx/src/share/atm_process/atmosphere_process.cpp +++ b/components/eamxx/src/share/atm_process/atmosphere_process.cpp @@ -15,7 +15,7 @@ namespace scream ekat::logger::LogLevel str2LogLevel (const std::string& s) { using namespace ekat::logger; - ekat::logger::LogLevel log_level; + ekat::logger::LogLevel log_level = LogLevel::err; if (s=="off") { log_level = LogLevel::off; } else if (s=="trace") { diff --git 
a/components/eamxx/src/share/io/eamxx_scorpio_interface.cpp b/components/eamxx/src/share/io/eamxx_scorpio_interface.cpp index 4bacad593406..050bbf4c226a 100644 --- a/components/eamxx/src/share/io/eamxx_scorpio_interface.cpp +++ b/components/eamxx/src/share/io/eamxx_scorpio_interface.cpp @@ -1415,7 +1415,7 @@ T get_attribute (const std::string& filename, err = PIOc_inq_atttype(pf.file->ncid,varid,attname.c_str(),&att_type); check_scorpio_noerr(err,filename,"attribute",attname,"get_attribute","inq_atttype"); - T val; + T val = {}; if (att_type!=nctype(get_dtype())) { if (att_type==PIO_INT) { diff --git a/components/eamxx/src/share/io/tests/io_basic.cpp b/components/eamxx/src/share/io/tests/io_basic.cpp index 85730b5b6401..6914791d8abc 100644 --- a/components/eamxx/src/share/io/tests/io_basic.cpp +++ b/components/eamxx/src/share/io/tests/io_basic.cpp @@ -37,7 +37,7 @@ void add (const Field& f, const double v) { } int get_dt (const std::string& freq_units) { - int dt; + int dt = -1; if (freq_units=="nsteps") { dt = 1; } else if (freq_units=="nsecs") { diff --git a/components/eamxx/src/share/io/tests/io_filled.cpp b/components/eamxx/src/share/io/tests/io_filled.cpp index 6ff9d4486442..23b604e7510c 100644 --- a/components/eamxx/src/share/io/tests/io_filled.cpp +++ b/components/eamxx/src/share/io/tests/io_filled.cpp @@ -40,7 +40,7 @@ void set (const Field& f, const double v) { } int get_dt (const std::string& freq_units) { - int dt; + int dt = -1; if (freq_units=="nsteps") { dt = 1; } else if (freq_units=="nsecs") { From 570a568a6dd7b9f514323c4cd320f8913c36b626 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 1 May 2025 16:17:09 -0600 Subject: [PATCH 5/9] EAMxx: fix cacts.yaml settings Conform them to v0.2.1 changes --- components/eamxx/cacts.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/components/eamxx/cacts.yaml b/components/eamxx/cacts.yaml index d851fc401ccd..e268de45b399 100644 --- a/components/eamxx/cacts.yaml +++ 
b/components/eamxx/cacts.yaml @@ -39,12 +39,13 @@ project: name: EAMxx baseline_gen_label: baseline_gen baseline_summary_file: baseline_list - cmake_vars_names: - enable_baselines: + cmake_settings: + baselines_on: SCREAM_ENABLE_BASELINE_TESTS: ON - generate_baselines: + baselines_off: + SCREAM_ENABLE_BASELINE_TESTS: OFF + baselines_only: SCREAM_ONLY_GENERATE_BASELINES: ON - baselines_dir: SCREAM_BASELINES_DIR cdash: drop_site: my.cdash.org drop_location: /submit.php?project=E3SM From 47d1eccec8c453583133e7dbb5d0cb915a8fb23a Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 1 May 2025 16:17:36 -0600 Subject: [PATCH 6/9] EAMxx: fix pip usage in eamxx-cacts wrapper --- components/eamxx/scripts/eamxx-cacts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/eamxx/scripts/eamxx-cacts b/components/eamxx/scripts/eamxx-cacts index cac749dd4e47..b1333f870cb8 100755 --- a/components/eamxx/scripts/eamxx-cacts +++ b/components/eamxx/scripts/eamxx-cacts @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Ensure cacts package is installed and up to date -pip install --user --upgrade cacts +python3 -m pip install --user --trusted-host pypi.org --upgrade cacts # Run cacts cacts "$@" From f76e0e3156f7f0f8c872236d1897e7259384459f Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 1 May 2025 17:11:00 -0600 Subject: [PATCH 7/9] Workflows: switch HIP testing to use CACTS package Besides the fact that we will transition to it for all jobs anyway, the HIP build is having a hard time getting the compilers to work with test-all-eamxx --- .github/workflows/eamxx-sa-testing.yml | 33 ++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eamxx-sa-testing.yml b/.github/workflows/eamxx-sa-testing.yml index 1d060ae06e5b..05e325c8dad1 100644 --- a/.github/workflows/eamxx-sa-testing.yml +++ b/.github/workflows/eamxx-sa-testing.yml @@ -31,6 +31,7 @@ on: options: - gcc-openmp - gcc-cuda + - gcc-hip - all bless: description: 
'Generate baselines' @@ -191,12 +192,30 @@ jobs: submodules: recursive - name: Show action trigger uses: ./.github/actions/show-workflow-trigger + - name: Ensure CACTS is installed + run: | + python3 -m pip install --user --upgrade --trusted-host pypi.org cacts - name: Run tests - uses: ./.github/actions/test-all-eamxx + run: | + cmd="cacts -m ghci-snl-hip -t ${{ matrix.test.build_type }} -b auto -r ./components/eamxx" + if [[ "${{ env.generate }}" == "true" ]]; then + cmd+=" -g" + fi + if [[ "${{ env.submit }}" == "true" ]]; then + cmd+=" -s" + fi + if [[ "${{ env.ekat }}" == "true" ]]; then + cmd+=" -c EKAT_ENABLE_TESTS=ON" + fi + echo "CACTS call: $cmd" + $cmd + - name: Upload files + if: always() + uses: actions/upload-artifact@v4 with: - build_type: ${{ matrix.test.build_type }} - machine: ghci-snl-cuda - generate: ${{ env.generate }} - submit: ${{ env.submit }} - cmake-configs: Kokkos_ARCH_HOPPER90=${{ env.Hopper }};Kokkos_ARCH_AMPERE80=${{ env.Ampere }};Kokkos_ARCH_VOLTA70=${{ env.Volta }};CMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }};SCREAM_SMALL_KERNELS=${{ matrix.test.SK }} - ekat: ${{ env.ekat }} + name: log-files-${{ matrix.test.build_type }}-ghci-snl-hip + path: | + components/eamxx/ctest-build/*/Testing/Temporary/Last*.log + components/eamxx/ctest-build/*/ctest_resource_file.json + components/eamxx/ctest-build/*/ctest_script.cmake + components/eamxx/ctest-build/*/CMakeCache.txt From f40317019195a568aaab019b9ec0242e126c19a1 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 1 May 2025 17:35:14 -0600 Subject: [PATCH 8/9] EAMxx: add missing cmake arg to default build config in cacts.yaml --- components/eamxx/cacts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/eamxx/cacts.yaml b/components/eamxx/cacts.yaml index e268de45b399..4c7a0c91cff0 100644 --- a/components/eamxx/cacts.yaml +++ b/components/eamxx/cacts.yaml @@ -228,6 +228,8 @@ configurations: description: null uses_baselines: True on_by_default: True + cmake_args: + 
SCREAM_BASELINES_DIR: ${machine.baselines_dir}/${build.longname} dbg: longname: full_debug From c002bce91c09aaf85e5b69e8fb1a0fb6ecbcad46 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Mon, 5 May 2025 09:27:09 -0600 Subject: [PATCH 9/9] EAMxx: fix pylint error in GHCISNLHip entry of machines_specs.py --- components/eamxx/scripts/machines_specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/eamxx/scripts/machines_specs.py b/components/eamxx/scripts/machines_specs.py index db0d9a6723e7..bdefaa87e159 100644 --- a/components/eamxx/scripts/machines_specs.py +++ b/components/eamxx/scripts/machines_specs.py @@ -292,7 +292,7 @@ def setup(cls): super().setup_base(name="ghci-snl-hip") cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-hip" cls.gpu_arch = "hip" - cls.num_run_res = int(run_cmd_no_fail("rocm-smi --showproductname | grep 'GPU\[' | awk '{print $1}' | sort -u | wc -l")) + cls.num_run_res = int(run_cmd_no_fail(r"rocm-smi --showproductname | grep 'GPU\[' | awk '{print $1}' | sort -u | wc -l")) cls.cxx_compiler = "mpicxx" cls.env_setup = ["source /etc/profile.d/setup-user-env.sh && export MPICH_CXX=hipcc"]