From 6dc8af9e4bc72e1af3a051b70cc5a6aa858a5921 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Tue, 29 Oct 2024 14:00:54 -0600 Subject: [PATCH 01/10] EAMxx: add support for ghci-snl-cuda in standalone testing --- .../eamxx/cmake/machine-files/ghci-snl-cuda.cmake | 11 +++++++++++ components/eamxx/scripts/machines_specs.py | 10 ++++++++++ 2 files changed, 21 insertions(+) create mode 100644 components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake diff --git a/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake b/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake new file mode 100644 index 00000000000..5d0743fcdbe --- /dev/null +++ b/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake @@ -0,0 +1,11 @@ +# Common settings for our ghci images +include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake) + +# Set SCREAM_MACHINE +set(SCREAM_MACHINE ghci-snl-cuda CACHE STRING "") + +# Enable CUDA in kokkos +set (EKAT_MACH_FILES_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../../externals/ekat/cmake/machine-files) +include (${EKAT_MACH_FILES_PATH}/kokkos/cuda.cmake) + +set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks") diff --git a/components/eamxx/scripts/machines_specs.py b/components/eamxx/scripts/machines_specs.py index 2878b976165..41d09f4683e 100644 --- a/components/eamxx/scripts/machines_specs.py +++ b/components/eamxx/scripts/machines_specs.py @@ -215,6 +215,16 @@ def setup(cls): super().setup_base("ghci-snl-cpu") cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-cpu" +############################################################################### +class GHCISNLCuda(Machine): +############################################################################### + concrete = True + @classmethod + def setup(cls): + super().setup_base(name="ghci-snl-cuda",num_bld_res=16,num_run_res=1) + cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-cuda" + cls.gpu_arch = "cuda" + 
############################################################################### class Lassen(Machine): ############################################################################### From cc4a3e57d6f4995af250caae33f8fd3edd403926 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Wed, 30 Oct 2024 17:50:57 -0600 Subject: [PATCH 02/10] Workflows: add support for ghci-snl-cuda also in eamxx-sa workflow --- .../workflows/eamxx-standalone-testing.yml | 89 +++++++++++++------ 1 file changed, 62 insertions(+), 27 deletions(-) diff --git a/.github/workflows/eamxx-standalone-testing.yml b/.github/workflows/eamxx-standalone-testing.yml index 6eb00798225..da3beca1da9 100644 --- a/.github/workflows/eamxx-standalone-testing.yml +++ b/.github/workflows/eamxx-standalone-testing.yml @@ -23,6 +23,7 @@ on: type: choice options: - gcc-openmp + - gcc-cuda bless: description: 'Generate baselines' required: true @@ -86,30 +87,64 @@ jobs: generate: ${{ env.generate }} submit: ${{ env.submit }} cmake-configs: Kokkos_ENABLE_OPENMP=ON - # cuda: - # # Disable until the CUDA container is up and running. When CUDA container is availabe, remove - # # this line and uncomment the next if - # if: false - # # Runs always for pull_request. 
For workflow_dispatch, user must request this machine - # # if: ${{ github.event_name == 'pull_request' || contains(github.event.inputs.jobs_to_run, 'openmp-gcc') }} - # runs-on: [self-hosted, cuda] - # strategy: - # fail-fast: false - # matrix: - # build_type: [sp, dbg, fpe, opt] - # name: cuda-${{ matrix.build_type }} - # steps: - # - name: Show action trigger - # uses: ./.github/actions/print-workflow-trigger - # - name: Check out the repository - # uses: actions/checkout@v4 - # with: - # persist-credentials: false - # show-progress: false - # submodules: recursive - # - name: Run tests - # uses: ./.github/actions/test-all-scream - # with: - # build_type: ${{ matrix.build_type }} - # machine: ghci-snl-cuda - # run_type: at-run + gcc-cuda: + runs-on: [self-hosted, ghci-snl-cuda, cuda, gcc] + strategy: + fail-fast: false + matrix: + build_type: [sp, dbg, opt] + # Run this workflow if: + # - workflow_dispatch: user requested this job. + # - schedule: always: + # - pull_request: matching skip label is NOT found + if: | + ${{ + github.event_name == 'schedule' || + ( github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list == 'gcc-cuda' ) || + ( + github.event_name == 'pull_request' && + !( + contains(github.event.pull_request.labels.*.name, 'AT: skip gcc') || + contains(github.event.pull_request.labels.*.name, 'AT: skip cuda') || + contains(github.event.pull_request.labels.*.name, 'AT: skip eamxx-sa') || + contains(github.event.pull_request.labels.*.name, 'AT: skip eamxx-all') + ) + ) + }} + name: gcc-cuda / ${{ matrix.build_type }} + steps: + - name: Show action trigger + uses: actions/github-script@v7 + with: + script: | + const eventName = context.eventName; + const eventAction = context.payload.action || 'N/A'; + const actor = context.actor; + console.log(`The job was triggered by a ${eventName} event.`); + console.log(` - Event action: ${eventAction}`); + console.log(` - Triggered by: ${actor}`); + - name: Check out the repository + uses: 
actions/checkout@v4 + with: + persist-credentials: false + show-progress: false + submodules: recursive + - name: Set test-all inputs based on event specs + run: | + echo "submit=false" >> $GITHUB_ENV + echo "generate=false" >> $GITHUB_ENV + if [ "${{ github.event_name }}" == "schedule" ]; then + echo "submit=true" >> $GITHUB_ENV + elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + if [ "${{ inputs.bless }}" == "true" ]; then + echo "generate=true" >> $GITHUB_ENV + fi + fi + - name: Run tests + uses: ./.github/actions/test-all-scream + with: + build_type: ${{ matrix.build_type }} + machine: ghci-snl-cuda + generate: ${{ env.generate }} + submit: ${{ env.submit }} + cmake-configs: Kokkos_ARCH_VOLTA70=ON;CMAKE_CUDA_ARCHITECTURES=70 From cdde3e08c718828fb8e114ce616a414c64215e10 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Wed, 30 Oct 2024 20:34:39 -0600 Subject: [PATCH 03/10] Workflows: skip eamxx v1/sa testing on draft PRs --- .../workflows/eamxx-standalone-testing.yml | 38 +++++-------------- .github/workflows/eamxx-v1-testing.yml | 4 -- 2 files changed, 10 insertions(+), 32 deletions(-) diff --git a/.github/workflows/eamxx-standalone-testing.yml b/.github/workflows/eamxx-standalone-testing.yml index da3beca1da9..07fa9afdb52 100644 --- a/.github/workflows/eamxx-standalone-testing.yml +++ b/.github/workflows/eamxx-standalone-testing.yml @@ -93,42 +93,24 @@ jobs: fail-fast: false matrix: build_type: [sp, dbg, opt] - # Run this workflow if: - # - workflow_dispatch: user requested this job. 
- # - schedule: always: - # - pull_request: matching skip label is NOT found - if: | - ${{ - github.event_name == 'schedule' || - ( github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list == 'gcc-cuda' ) || - ( - github.event_name == 'pull_request' && - !( - contains(github.event.pull_request.labels.*.name, 'AT: skip gcc') || - contains(github.event.pull_request.labels.*.name, 'AT: skip cuda') || - contains(github.event.pull_request.labels.*.name, 'AT: skip eamxx-sa') || - contains(github.event.pull_request.labels.*.name, 'AT: skip eamxx-all') - ) - ) - }} + if: ${{ !(github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list != 'gcc-cuda') }} name: gcc-cuda / ${{ matrix.build_type }} steps: - - name: Show action trigger - uses: actions/github-script@v7 - with: - script: | - const eventName = context.eventName; - const eventAction = context.payload.action || 'N/A'; - const actor = context.actor; - console.log(`The job was triggered by a ${eventName} event.`); - console.log(` - Event action: ${eventAction}`); - console.log(` - Triggered by: ${actor}`); - name: Check out the repository uses: actions/checkout@v4 with: persist-credentials: false show-progress: false submodules: recursive + - name: Show action trigger + uses: ./.github/actions/show-workflow-trigger + - name: Check for skip labels + if: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_review' }} + uses: ./.github/actions/check-skip-labels + with: + skip_labels: 'AT: skip gcc,AT: skip cuda,AT: skip eamxx-sa,AT: skip eamxx-all' + token: ${{ secrets.GITHUB_TOKEN }} + pr_number: ${{ github.event.pull_request.number }} - name: Set test-all inputs based on event specs run: | echo "submit=false" >> $GITHUB_ENV diff --git a/.github/workflows/eamxx-v1-testing.yml b/.github/workflows/eamxx-v1-testing.yml index ce652406845..b4331578871 100644 --- a/.github/workflows/eamxx-v1-testing.yml +++ b/.github/workflows/eamxx-v1-testing.yml @@ -54,10 +54,6 @@ 
jobs: short_name: SMS_D_Ln5.ne4pg2_oQU480.F2010-SCREAMv1-MPASSI.scream-mam4xx-all_mam4xx_procs fail-fast: false name: cpu-gcc / ${{ matrix.test.short_name }} - # Run this workflow if: - # - workflow_dispatch: user requested this job. - # - schedule: always: - # - pull_request/pull_request_review: matching skip label is NOT found if: ${{ !(github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list != 'cpu-gcc') }} steps: - name: Check out the repository From 0b5adb69f594333a07e783889d84db4109e8011c Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 11:05:55 -0600 Subject: [PATCH 04/10] EAMxx: fix parsing of inputs in rrtmgp generate_baseline We must allow extra args. E.g., --ekat-kokkos-device N may be added by EKAT's test-launcher --- .../eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp b/components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp index 97b3887acc8..ec27719abed 100644 --- a/components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp +++ b/components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp @@ -33,15 +33,15 @@ int main (int argc, char** argv) { auto logger = std::make_shared("",LogLevel::info,comm); // Get filenames from command line - if (argc != 3) { + if (argc < 3) { std::string msg = "Missing required inputs. 
Usage:\n"; msg += argv[0]; msg += " inputfile baseline\n"; logger->error(msg); return 1; } - std::string inputfile(argv[argc-2]); - std::string baseline(argv[argc-1]); + std::string inputfile(argv[1]); + std::string baseline(argv[2]); // Initialize yakl yakl::init(); From 6df95839ed83ddadbd50a4f44b673cd726ae4b01 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 15:19:00 -0600 Subject: [PATCH 05/10] Workflows: use different artifact names for different machines in TAS action Prevents errors when same workflow runs TAS action with different input machines --- .github/actions/test-all-scream/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/test-all-scream/action.yml b/.github/actions/test-all-scream/action.yml index 9f347e97d47..6ad60b8f31b 100644 --- a/.github/actions/test-all-scream/action.yml +++ b/.github/actions/test-all-scream/action.yml @@ -95,7 +95,7 @@ runs: if: always() uses: actions/upload-artifact@v4 with: - name: log-files-${{ inputs.build_type }} + name: log-files-${{ inputs.build_type }}-${{ inputs.machine }} path: | components/eamxx/ctest-build/*/Testing/Temporary/Last*.log components/eamxx/ctest-build/*/ctest_resource_file.json From 7382c0c4407d2c36feee5c2d9266f5891c7d9c5f Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 15:23:24 -0600 Subject: [PATCH 06/10] Workflows: fix concurrency group in eamxx workflows Both pull_request and pull_request_review must be in the same concurrency group --- .github/workflows/eamxx-scripts-tests.yml | 2 +- .github/workflows/eamxx-standalone-testing.yml | 2 +- .github/workflows/eamxx-v1-testing.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eamxx-scripts-tests.yml b/.github/workflows/eamxx-scripts-tests.yml index c1efdcc6184..4e78b18a4d0 100644 --- a/.github/workflows/eamxx-scripts-tests.yml +++ b/.github/workflows/eamxx-scripts-tests.yml @@ -23,7 +23,7 @@ concurrency: # - they have the same 
trigger # - if trigger=pull_request/pull_request_review, the PR number must match # - if trigger=workflow_dispatch/schedule: no concurrency - group: ${{ github.workflow }}-${{ github.event_name }}-${{ + group: ${{ github.workflow }}-${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/eamxx-standalone-testing.yml b/.github/workflows/eamxx-standalone-testing.yml index 07fa9afdb52..53766290c47 100644 --- a/.github/workflows/eamxx-standalone-testing.yml +++ b/.github/workflows/eamxx-standalone-testing.yml @@ -38,7 +38,7 @@ concurrency: # - they have the same trigger # - if trigger=pull_request/pull_request_review, the PR number must match # - if trigger=workflow_dispatch/schedule: no concurrency - group: ${{ github.workflow }}-${{ github.event_name }}-${{ + group: ${{ github.workflow }}-${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/eamxx-v1-testing.yml b/.github/workflows/eamxx-v1-testing.yml index b4331578871..0ecec3453e8 100644 --- a/.github/workflows/eamxx-v1-testing.yml +++ b/.github/workflows/eamxx-v1-testing.yml @@ -33,7 +33,7 @@ concurrency: # - they have the same trigger # - if trigger=pull_request/pull_request_review: the PR number must match # - if trigger=workflow_dispatch: no concurrency - group: ${{ github.workflow }}-${{ github.event_name }}-${{ + group: ${{ github.workflow }}-${{ (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id }} cancel-in-progress: true From fc82112cee80e19a4d5621be299965f35b2834c9 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 17:21:36 -0600 Subject: [PATCH 07/10] Workflows: simplified concurrency of eamxx workflows --- 
.github/workflows/eamxx-scripts-tests.yml | 11 ++++------- .github/workflows/eamxx-standalone-testing.yml | 11 ++++------- .github/workflows/eamxx-v1-testing.yml | 11 ++++------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/.github/workflows/eamxx-scripts-tests.yml b/.github/workflows/eamxx-scripts-tests.yml index 4e78b18a4d0..d3abce089c9 100644 --- a/.github/workflows/eamxx-scripts-tests.yml +++ b/.github/workflows/eamxx-scripts-tests.yml @@ -19,13 +19,10 @@ on: - cron: '0 7 * * *' # Runs at 7 AM UTC, which is midnight MT during Standard Time concurrency: - # Two runs are in the same group if: - # - they have the same trigger - # - if trigger=pull_request/pull_request_review, the PR number must match - # - if trigger=workflow_dispatch/schedule: no concurrency - group: ${{ github.workflow }}-${{ - (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id - }} + # Two runs are in the same group if they are testing the same git ref + # - if trigger=pull_request, the ref is refs/pull/<pr_number>/merge + # - for other triggers, the ref is the branch tested + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: diff --git a/.github/workflows/eamxx-standalone-testing.yml b/.github/workflows/eamxx-standalone-testing.yml index 53766290c47..cb87b540715 100644 --- a/.github/workflows/eamxx-standalone-testing.yml +++ b/.github/workflows/eamxx-standalone-testing.yml @@ -34,13 +34,10 @@ on: - cron: '0 7 * * *' # Runs at 7 AM UTC, which is midnight MT during Standard Time concurrency: - # Two runs are in the same group if: - # - they have the same trigger - # - if trigger=pull_request/pull_request_review, the PR number must match - # - if trigger=workflow_dispatch/schedule: no concurrency - group: ${{ github.workflow }}-${{ - (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id - }} + 
# Two runs are in the same group if they are testing the same git ref + # - if trigger=pull_request, the ref is refs/pull/<pr_number>/merge + # - for other triggers, the ref is the branch tested + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: diff --git a/.github/workflows/eamxx-v1-testing.yml b/.github/workflows/eamxx-v1-testing.yml index 0ecec3453e8..398a894f747 100644 --- a/.github/workflows/eamxx-v1-testing.yml +++ b/.github/workflows/eamxx-v1-testing.yml @@ -29,13 +29,10 @@ on: type: boolean concurrency: - # Two runs are in the same group if: - # - they have the same trigger - # - if trigger=pull_request/pull_request_review: the PR number must match - # - if trigger=workflow_dispatch: no concurrency - group: ${{ github.workflow }}-${{ - (github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id - }} + # Two runs are in the same group if they are testing the same git ref + # - if trigger=pull_request, the ref is refs/pull/<pr_number>/merge + # - for other triggers, the ref is the branch tested + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: From 823b774ef6496e8d4a00fdb94fa6741e1a5f6a96 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 17:22:52 -0600 Subject: [PATCH 08/10] Workflows: remove pull_request_review event from eamxx workflows --- .github/workflows/eamxx-scripts-tests.yml | 2 -- .github/workflows/eamxx-standalone-testing.yml | 2 -- .github/workflows/eamxx-v1-testing.yml | 2 -- 3 files changed, 6 deletions(-) diff --git a/.github/workflows/eamxx-scripts-tests.yml b/.github/workflows/eamxx-scripts-tests.yml index d3abce089c9..aad6e9ac229 100644 --- a/.github/workflows/eamxx-scripts-tests.yml +++ b/.github/workflows/eamxx-scripts-tests.yml @@ -8,8 +8,6 @@ on: paths: - components/eamxx/scripts/** - components/eamxx/cime_config/*.py - pull_request_review: - types: [submitted] # Manual run for debug purposes only 
workflow_dispatch: diff --git a/.github/workflows/eamxx-standalone-testing.yml b/.github/workflows/eamxx-standalone-testing.yml index cb87b540715..3e51a5a5b77 100644 --- a/.github/workflows/eamxx-standalone-testing.yml +++ b/.github/workflows/eamxx-standalone-testing.yml @@ -11,8 +11,6 @@ on: - components/eam/src/physics/p3/scream/** - components/eam/src/physics/cam/** - .github/workflows/eamxx-standalone-testing.yml - pull_request_review: - types: [submitted] # Manual run is used to bless workflow_dispatch: diff --git a/.github/workflows/eamxx-v1-testing.yml b/.github/workflows/eamxx-v1-testing.yml index 398a894f747..ff10d48cec7 100644 --- a/.github/workflows/eamxx-v1-testing.yml +++ b/.github/workflows/eamxx-v1-testing.yml @@ -11,8 +11,6 @@ on: - components/eam/src/physics/p3/scream/** - components/eam/src/physics/cam/** - .github/workflows/eamxx-v1-testing.yml - pull_request_review: - types: [submitted] # Manual run is used to bless workflow_dispatch: From 3724d459ddd94694a35c359aa07001e00ac24ea7 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 17:33:59 -0600 Subject: [PATCH 09/10] EAMxx: disable cuda-aware MPI in ghci-snl-cuda --- components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake b/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake index 5d0743fcdbe..72a352f09d7 100644 --- a/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake +++ b/components/eamxx/cmake/machine-files/ghci-snl-cuda.cmake @@ -9,3 +9,6 @@ set (EKAT_MACH_FILES_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../../externals/ekat/c include (${EKAT_MACH_FILES_PATH}/kokkos/cuda.cmake) set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks") + +# TODO: rebuild cuda image with cuda-aware MPI, so we can set this to ON +option(SCREAM_MPI_ON_DEVICE "Whether to use device pointers for MPI calls" OFF) From 
5da88db0c8f11a1249f5fd2dee8d3d1042046715 Mon Sep 17 00:00:00 2001 From: Luca Bertagna Date: Thu, 31 Oct 2024 19:35:18 -0600 Subject: [PATCH 10/10] EAMxx: allow 1ulp diff in wind_speed test in release mode --- .../eamxx/src/diagnostics/tests/wind_speed_tests.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp b/components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp index 5e6be61ba4f..1298536c423 100644 --- a/components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp +++ b/components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp @@ -62,6 +62,11 @@ TEST_CASE("wind_speed") register_diagnostics(); constexpr int ntests = 5; +#ifdef NDEBUG + constexpr int ulp_tol = 1; +#else + constexpr int ulp_tol = 0; +#endif for (int itest=0; itest