E3SM-Project · bartgol · Nov 1, 2024 · Oct 29, 2024 · Oct 30, 2024 · Oct 31, 2024
diff --git a/.github/actions/test-all-scream/action.yml b/.github/actions/test-all-scream/action.yml
@@ -95,7 +95,7 @@ runs:
       if: always()
       uses: actions/upload-artifact@v4
       with:
-        name: log-files-${{ inputs.build_type }}
+        name: log-files-${{ inputs.build_type }}-${{ inputs.machine }}
         path: |
           components/eamxx/ctest-build/*/Testing/Temporary/Last*.log
           components/eamxx/ctest-build/*/ctest_resource_file.json

diff --git a/.github/workflows/eamxx-scripts-tests.yml b/.github/workflows/eamxx-scripts-tests.yml
@@ -8,8 +8,6 @@ on:
     paths:
       - components/eamxx/scripts/**
       - components/eamxx/cime_config/*.py
-  pull_request_review:
-    types: [submitted]
 
   # Manual run for debug purposes only
   workflow_dispatch:
@@ -19,13 +17,10 @@ on:
     - cron: '0 7 * * *'  # Runs at 7 AM UTC, which is midnight MT during Standard Time
 
 concurrency:
-  # Two runs are in the same group if:
-  #  - they have the same trigger
-  #  - if trigger=pull_request/pull_request_review, the PR number must match
-  #  - if trigger=workflow_dispatch/schedule: no concurrency
-  group: ${{ github.workflow }}-${{ github.event_name }}-${{
-             (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id
-           }}
+  # Two runs are in the same group if they are testing the same git ref
+  #  - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge
+  #  - for other triggers, the ref is the branch tested
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:

diff --git a/.github/workflows/eamxx-standalone-testing.yml b/.github/workflows/eamxx-standalone-testing.yml
@@ -11,8 +11,6 @@ on:
       - components/eam/src/physics/p3/scream/**
       - components/eam/src/physics/cam/**
       - .github/workflows/eamxx-standalone-testing.yml
-  pull_request_review:
-    types: [submitted]
 
   # Manual run is used to bless
   workflow_dispatch:
@@ -23,6 +21,7 @@ on:
         type: choice
         options:
           - gcc-openmp
+          - gcc-cuda
       bless:
         description: 'Generate baselines'
         required: true
@@ -33,13 +32,10 @@ on:
     - cron: '0 7 * * *'  # Runs at 7 AM UTC, which is midnight MT during Standard Time
 
 concurrency:
-  # Two runs are in the same group if:
-  #  - they have the same trigger
-  #  - if trigger=pull_request/pull_request_review, the PR number must match
-  #  - if trigger=workflow_dispatch/schedule: no concurrency
-  group: ${{ github.workflow }}-${{ github.event_name }}-${{
-             (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id
-           }}
+  # Two runs are in the same group if they are testing the same git ref
+  #  - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge
+  #  - for other triggers, the ref is the branch tested
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -86,30 +82,46 @@ jobs:
           generate: ${{ env.generate }}
           submit: ${{ env.submit }}
           cmake-configs: Kokkos_ENABLE_OPENMP=ON
-  # cuda:
-  #   # Disable until the CUDA container is up and running. When CUDA container is availabe, remove
-  #   # this line and uncomment the next if
-  #   if: false
-  #   # Runs always for pull_request. For workflow_dispatch, user must request this machine
-  #   # if: ${{ github.event_name == 'pull_request' || contains(github.event.inputs.jobs_to_run, 'openmp-gcc') }}
-  #   runs-on:  [self-hosted, cuda]
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       build_type: [sp, dbg, fpe, opt]
-  #   name: cuda-${{ matrix.build_type }}
-  #   steps:
-  #     - name: Show action trigger
-  #       uses: ./.github/actions/print-workflow-trigger
-  #     - name: Check out the repository
-  #       uses: actions/checkout@v4
-  #       with:
-  #         persist-credentials: false
-  #         show-progress: false
-  #         submodules: recursive
-  #     - name: Run tests
-  #       uses: ./.github/actions/test-all-scream
-  #       with:
-  #         build_type: ${{ matrix.build_type }}
-  #         machine: ghci-snl-cuda
-  #         run_type: at-run
+  gcc-cuda:  # EAMxx standalone tests on the self-hosted ghci-snl-cuda runner
+    runs-on: [self-hosted, ghci-snl-cuda, cuda, gcc]
+    strategy:
+      fail-fast: false  # a failing build type must not cancel the others
+      matrix:
+        build_type: [sp, dbg, opt]
+    if: ${{ !(github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list != 'gcc-cuda') }}  # manual runs: only if selected
+    name: gcc-cuda / ${{ matrix.build_type }}
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          show-progress: false
+          submodules: recursive
+      - name: Show action trigger
+        uses: ./.github/actions/show-workflow-trigger
+      - name: Check for skip labels
+        if: ${{ github.event_name == 'pull_request' }}  # pull_request_review no longer triggers this workflow
+        uses: ./.github/actions/check-skip-labels
+        with:
+          skip_labels: 'AT: skip gcc,AT: skip cuda,AT: skip eamxx-sa,AT: skip eamxx-all'
+          token: ${{ secrets.GITHUB_TOKEN }}
+          pr_number: ${{ github.event.pull_request.number }}
+      - name: Set test-all inputs based on event specs
+        run: |
+          echo "submit=false" >> "$GITHUB_ENV"  # defaults; a later write of the same key wins
+          echo "generate=false" >> "$GITHUB_ENV"
+          if [ "${{ github.event_name }}" = "schedule" ]; then
+            echo "submit=true" >> "$GITHUB_ENV"
+          elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            if [ "${{ inputs.bless }}" = "true" ]; then  # manual "bless" run: generate baselines
+              echo "generate=true" >> "$GITHUB_ENV"
+            fi
+          fi
+      - name: Run tests
+        uses: ./.github/actions/test-all-scream
+        with:
+          build_type: ${{ matrix.build_type }}
+          machine: ghci-snl-cuda
+          generate: ${{ env.generate }}
+          submit: ${{ env.submit }}
+          cmake-configs: Kokkos_ARCH_VOLTA70=ON;CMAKE_CUDA_ARCHITECTURES=70
diff --git a/.github/workflows/eamxx-v1-testing.yml b/.github/workflows/eamxx-v1-testing.yml
@@ -11,8 +11,6 @@ on:
       - components/eam/src/physics/p3/scream/**
       - components/eam/src/physics/cam/**
       - .github/workflows/eamxx-v1-testing.yml
-  pull_request_review:
-    types: [submitted]
 
   # Manual run is used to bless
   workflow_dispatch:
@@ -29,13 +27,10 @@ on:
         type: boolean
 
 concurrency:
-  # Two runs are in the same group if:
-  #  - they have the same trigger
-  #  - if trigger=pull_request/pull_request_review: the PR number must match
-  #  - if trigger=workflow_dispatch: no concurrency
-  group: ${{ github.workflow }}-${{ github.event_name }}-${{
-             (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id
-           }}
+  # Two runs are in the same group if they are testing the same git ref
+  #  - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge
+  #  - for other triggers, the ref is the branch tested
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
@@ -54,10 +49,6 @@ jobs:
             short_name: SMS_D_Ln5.ne4pg2_oQU480.F2010-SCREAMv1-MPASSI.scream-mam4xx-all_mam4xx_procs
       fail-fast: false
     name: cpu-gcc / ${{ matrix.test.short_name }}
-    # Run this workflow if:
-    #   - workflow_dispatch: user requested this job.
-    #   - schedule: always:
-    #   - pull_request/pull_request_review: matching skip label is NOT found
     if: ${{ !(github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list != 'cpu-gcc') }}
     steps:
       - name: Check out the repository

diff --git a/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake b/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake
@@ -0,0 +1,14 @@
+# Start from the common settings shared by our ghci images
+include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake)
+
+# Store the machine name in the cache as SCREAM_MACHINE
+set(SCREAM_MACHINE ghci-snl-cuda CACHE STRING "")
+
+# Enable CUDA in kokkos via EKAT's kokkos/cuda.cmake (path resolved relative to this file)
+set (EKAT_MACH_FILES_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../../externals/ekat/cmake/machine-files)
+include (${EKAT_MACH_FILES_PATH}/kokkos/cuda.cmake)
+
+set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks")
+
+# TODO: rebuild cuda image with cuda-aware MPI, so we can set this to ON
+option(SCREAM_MPI_ON_DEVICE "Whether to use device pointers for MPI calls" OFF)
diff --git a/components/eamxx/scripts/machines_specs.py b/components/eamxx/scripts/machines_specs.py
@@ -215,6 +215,16 @@ def setup(cls):
         super().setup_base("ghci-snl-cpu")
         cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-cpu"
 
+###############################################################################
+class GHCISNLCuda(Machine):  # machine specs for ghci-snl-cuda
+###############################################################################
+    concrete = True
+    @classmethod
+    def setup(cls):  # passes name/resource counts to setup_base, then sets baselines dir and GPU arch
+        super().setup_base(name="ghci-snl-cuda",num_bld_res=16,num_run_res=1)
+        cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-cuda"
+        cls.gpu_arch = "cuda"
+
 ###############################################################################
 class Lassen(Machine):
 ###############################################################################

diff --git a/components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp b/components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp
@@ -62,6 +62,11 @@ TEST_CASE("wind_speed")
   register_diagnostics();
 
   constexpr int ntests = 5;
+#ifdef NDEBUG
+  constexpr int ulp_tol = 1;
+#else
+  constexpr int ulp_tol = 0;
+#endif
   for (int itest=0; itest<ntests; ++itest) {
     // Randomize wind
     randomize(uv,engine,pdf);
@@ -87,7 +92,7 @@ TEST_CASE("wind_speed")
       for (int ilev=0; ilev<nlevs; ++ilev) {
         const auto u = uv_h (icol,0,ilev);
         const auto v = uv_h (icol,1,ilev);
-        REQUIRE (ws_h(icol,ilev) == std::sqrt(u*u+v*v));
+        REQUIRE_THAT (ws_h(icol,ilev), Catch::Matchers::WithinULP(std::sqrt(u*u+v*v),ulp_tol));
       }
     }
   }

diff --git a/components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp b/components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp
@@ -33,15 +33,15 @@ int main (int argc, char** argv) {
     auto logger = std::make_shared<logger_t>("",LogLevel::info,comm);
 
     // Get filenames from command line
-    if (argc != 3) {
+    if (argc < 3) {
       std::string msg = "Missing required inputs. Usage:\n";
       msg += argv[0];
       msg += " inputfile baseline\n";
       logger->error(msg);
       return 1;
     }
-    std::string inputfile(argv[argc-2]);
-    std::string baseline(argv[argc-1]);
+    std::string inputfile(argv[1]);
+    std::string baseline(argv[2]);
 
     // Initialize yakl
     yakl::init();