From 61efe1d2e600bc66bebfc66aebe6f6f207d5b5a8 Mon Sep 17 00:00:00 2001 From: Michael J Schmidt Date: Mon, 25 Aug 2025 15:33:21 -0600 Subject: [PATCH 1/4] ff mam x validation for new compare script --- src/validation/mam_x_validation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/validation/mam_x_validation b/src/validation/mam_x_validation index a2617c5d3..8aec10cb9 160000 --- a/src/validation/mam_x_validation +++ b/src/validation/mam_x_validation @@ -1 +1 @@ -Subproject commit a2617c5d3009b233bc6460406b074cc8a1bd2a19 +Subproject commit 8aec10cb969aa795ff83dfe1fbccd275d36e9f63 From dfa8dd24e7382658210f807b4652ee4cc2c6ffac Mon Sep 17 00:00:00 2001 From: Michael J Schmidt Date: Wed, 20 Aug 2025 13:13:08 -0600 Subject: [PATCH 2/4] adds AMD/HIP autotesting via AT2 --- .github/workflows/README.md | 2 +- .github/workflows/at2_gcc-hip.yml | 122 ++++++++++++++++++++++ .github/workflows/m4x_autotester_main.yml | 13 +++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/at2_gcc-hip.yml diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 576d80d15..1a267198d 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -12,7 +12,7 @@ To do this, testing is initialized via the top-level workflow, `MAM4xx Autoteste #### GPU-based Testing -| Test Name | GPU Brand | GPU Type | Micoarchitecture | Compute Capability | Machine | Compilers | +| Test Name | GPU Brand | GPU Type | Microarchitecture | Compute Capability | Machine | Compilers | | --------------------------------- | --------- | -------- | ---------------- | ------------------ | ------- | ---------------------------- | | GPU AT2 gcc 12.3 cuda 12.1 | NVIDIA | H100 | Hopper | 9.0 | blake | `gcc` 12.3.0/`nvcc` 12.1.105 | diff --git a/.github/workflows/at2_gcc-hip.yml b/.github/workflows/at2_gcc-hip.yml new file mode 100644 index 000000000..8cbd94ec3 --- /dev/null +++ b/.github/workflows/at2_gcc-hip.yml @@ -0,0 +1,122 @@ +name: 
"GPU AT2 gcc 13.3 hip 6.2" + +on: + workflow_call: + inputs: + precision: + required: false + type: string + build_type: + required: false + type: string + +jobs: + # this is more work than I'd expect, but this is how you pass info after + # operating on it in a job/step + # TODO: factor this out into an action? + # parse the inputs from the workflow call that'll be used by strategy.matrix + define_matrix: + runs-on: ubuntu-22.04 + # define the outputs that will come from the steps below + outputs: + build_type: ${{ steps.build_type.outputs.build_type }} + precision: ${{ steps.precision.outputs.precision }} + steps: + - name: Define build_type + id: build_type + env: + # if empty (i.e., triggered by PR) make ALL default + btype: ${{ inputs.build_type || 'ALL' }} + # match on the shell variable "$btype" rather than re-interpolating the value into the script; the '*' fallback should never happen + run: | + case "$btype" in + "Debug") + echo 'build_type=["Debug"]' >> "$GITHUB_OUTPUT" ;; + "Release") + echo 'build_type=["Release"]' >> "$GITHUB_OUTPUT" ;; + "ALL") + echo 'build_type=["Debug", "Release"]' >> "$GITHUB_OUTPUT" ;; + *) + echo 'build_type=["Debug", "Release"]' >> "$GITHUB_OUTPUT" ;; + esac + - name: Define precision + id: precision + env: + prec: ${{ inputs.precision || 'ALL' }} + run: | + case "$prec" in + "Debug") + echo 'precision=["single"]' >> "$GITHUB_OUTPUT" ;; + "Release") + echo 'precision=["double"]' >> "$GITHUB_OUTPUT" ;; + "ALL") + echo 'precision=["single", "double"]' >> "$GITHUB_OUTPUT" ;; + *) + echo 'precision=["single", "double"]' >> "$GITHUB_OUTPUT" ;; + esac + gcc-hip: + runs-on: [self-hosted, m4xci-snl-hip, hip, gcc] + # will run other tests in the matrix even if one fails + # NOTE: prioritizes extra info over speed, so consider whether this makes sense + continue-on-error: false + needs: define_matrix + # A build matrix storing all desired configurations. 
+ strategy: + fail-fast: true + matrix: + # to get the array instead of a string, need the fromJSON() + build-type: ${{ fromJSON(needs.define_matrix.outputs.build_type) }} + fp-precision: ${{ fromJSON(needs.define_matrix.outputs.precision) }} + name: gcc-hip / ${{ matrix.build-type }} - ${{ matrix.fp-precision }} + steps: + - name: Check out the repository + uses: actions/checkout@v4 + with: + persist-credentials: false + show-progress: false + submodules: recursive + - name: Cloning Haero + uses: actions/checkout@v4 + with: + repository: eagles-project/haero + submodules: recursive + path: haero_src + - name: Show action trigger + uses: ./.github/actions/show-workflow-trigger + - name: Building Haero (${{ matrix.build-type }}, ${{ matrix.fp-precision }} precision) + run: | + cmake -S haero_src -B haero_build \ + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ + -DCMAKE_INSTALL_PREFIX="haero_install" \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DHAERO_SKIP_FIND_YAML_CPP=ON \ + -DHAERO_ENABLE_MPI=OFF \ + -DHAERO_ENABLE_GPU=ON \ + -DHAERO_PRECISION=${{ matrix.fp-precision }} \ + -DKokkos_ARCH_AMD_GFX90A=ON \ + -DHAERO_DEVICE_ARCH=AMD_GFX90A + cd haero_build + make -j"$(nproc)" + make install + - name: Configuring MAM4xx (${{ matrix.build-type }}, ${{ matrix.fp-precision }} precision) + run: | + cmake -S . 
-B build \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_INSTALL_PREFIX=$(pwd)/install \ + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ + -DMAM4XX_HAERO_DIR=$(pwd)/haero_install \ + -DNUM_VERTICAL_LEVELS=72 \ + -DENABLE_COVERAGE=OFF \ + -DENABLE_SKYWALKER=ON \ + -DCMAKE_CUDA_ARCHITECTURES=AMD_GFX90A \ + -G "Unix Makefiles" + - name: Building MAM4xx (${{ matrix.build-type }}, ${{ matrix.fp-precision }} precision) + run: | + cd build + make -j"$(nproc)" + - name: Running tests (${{ matrix.build-type }}, ${{ matrix.fp-precision }} precision) + run: | + cd build + ctest -V --output-on-failure diff --git a/.github/workflows/m4x_autotester_main.yml b/.github/workflows/m4x_autotester_main.yml index b48fba26d..5f6afb937 100644 --- a/.github/workflows/m4x_autotester_main.yml +++ b/.github/workflows/m4x_autotester_main.yml @@ -31,6 +31,7 @@ on: default: 'GPU-NVIDIA_H100' options: - GPU-NVIDIA_H100 + - GPU-AMD_MI200-series - CPU-Ubuntu_22-04 - ALL precision: @@ -71,6 +72,10 @@ jobs: if: ${{ github.event.pull_request || github.event.schedule }} uses: ./.github/workflows/at2_gcc-cuda.yml + gcc-hip: + if: ${{ github.event.pull_request || github.event.schedule }} + uses: + ./.github/workflows/at2_gcc-hip.yml gcc-cpu_gh: if: ${{ github.event.pull_request || github.event.schedule }} secrets: @@ -89,6 +94,14 @@ jobs: build_type: ${{ github.event.inputs.build_type }} uses: "./.github/workflows/at2_gcc-cuda.yml" + manual-gpu_hip: + if: ${{ contains(github.event.inputs.architecture, 'GPU-AMD_MI200-series') || + contains(github.event.inputs.architecture, 'ALL') }} + with: + precision: ${{ github.event.inputs.precision }} + build_type: ${{ github.event.inputs.build_type }} + uses: + "./.github/workflows/at2_gcc-hip.yml" manual-cpu_gh: if: ${{ contains(github.event.inputs.architecture, 'CPU-Ubuntu_22-04') || contains(github.event.inputs.architecture, 'ALL') }} From 57c5c05c7e553a8a2d710c6ecb0b293ddbb948ee Mon Sep 17 00:00:00 2001 From: Michael J Schmidt Date: Fri, 22 Aug 
2025 
13:12:07 -0600 Subject: [PATCH 3/4] correct precision choosing bug for manual trigger --- .github/workflows/at2_gcc-cuda.yml | 7 +++---- .github/workflows/at2_gcc-hip.yml | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/at2_gcc-cuda.yml b/.github/workflows/at2_gcc-cuda.yml index cc0f2a347..3d4cb98d0 100644 --- a/.github/workflows/at2_gcc-cuda.yml +++ b/.github/workflows/at2_gcc-cuda.yml @@ -45,9 +45,9 @@ jobs: prec: ${{ inputs.precision || 'ALL' }} run: | case ${{ env.prec }} in - "Debug") + "single") echo 'precision=["single"]' >> "$GITHUB_OUTPUT" ;; - "Release") + "double") echo 'precision=["double"]' >> "$GITHUB_OUTPUT" ;; "ALL") echo 'precision=["single", "double"]' >> "$GITHUB_OUTPUT" ;; @@ -56,8 +56,7 @@ jobs: esac gcc-cuda: runs-on: [self-hosted, m4xci-snl-cuda, cuda, gcc] - # will run other tests in the matrix even if one fails - # NOTE: prioritizes extra info over speed, so consider whether this makes sense + # will not run other tests in the matrix if one fails continue-on-error: false needs: define_matrix # A build matrix storing all desired configurations. 
diff --git a/.github/workflows/at2_gcc-hip.yml b/.github/workflows/at2_gcc-hip.yml index 8cbd94ec3..c9e669911 100644 --- a/.github/workflows/at2_gcc-hip.yml +++ b/.github/workflows/at2_gcc-hip.yml @@ -45,9 +45,9 @@ jobs: prec: ${{ inputs.precision || 'ALL' }} run: | case "$prec" in - "Debug") + "single") echo 'precision=["single"]' >> "$GITHUB_OUTPUT" ;; - "Release") + "double") echo 'precision=["double"]' >> "$GITHUB_OUTPUT" ;; "ALL") echo 'precision=["single", "double"]' >> "$GITHUB_OUTPUT" ;; @@ -56,8 +56,7 @@ jobs: esac gcc-hip: runs-on: [self-hosted, m4xci-snl-hip, hip, gcc] - # will run other tests in the matrix even if one fails - # NOTE: prioritizes extra info over speed, so consider whether this makes sense + # will not run other tests in the matrix if one fails continue-on-error: false needs: define_matrix # A build matrix storing all desired configurations. From 24f926c3c1d686376412c423c7931533c16333c6 Mon Sep 17 00:00:00 2001 From: Michael J Schmidt Date: Fri, 22 Aug 2025 15:57:04 -0600 Subject: [PATCH 4/4] update READMEs --- .github/workflows/AT2-README.md | 27 +++++++++++++++++---------- .github/workflows/README.md | 25 +++++++++++++++++-------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/.github/workflows/AT2-README.md b/.github/workflows/AT2-README.md index f69a13a4e..78ce3fc74 100644 --- a/.github/workflows/AT2-README.md +++ b/.github/workflows/AT2-README.md @@ -1,5 +1,6 @@ # Autotester2 (AT2) Workflow for MAM4xx - This document contains a brief description of how AT2 is used to automate testing on SNL hardware. + +This document contains a brief description of how AT2 is used to automate testing on SNL hardware. Additionally, any helpful notes and TODOs may be kept here to assist developers. 
## Overview @@ -10,9 +11,10 @@ This is done for security/policy reasons and ensures that only those with approv ### Test Hardware and Compiler Configurations -| Test Name | GPU Brand | GPU Type | Micoarchitecture | Compute Capability | Machine | Compilers | -| -------------------- | --------- | -------- | ---------------- | ------------------ | ------- | ---------------------------- | -| gcc_12-3-0_cuda_12-1 | NVIDIA | H100 | Hopper | 9.0 | blake | `gcc` 12.3.0/`nvcc` 12.1.105 | +| Test Name | GPU Brand | GPU Type | Microarchitecture | Compute Capability | Machine | OS | Compilers | +| --------------------------------- | --------- | ------------| ----------------- | ------------------ | ------- | ------ | -------------------------------- | +| GPU AT2 gcc 12.3 cuda 12.1 | NVIDIA | H100 | Hopper | 9.0 | blake | RHEL8 | `gcc` 12.3.0/`nvcc` 12.1.105 | +| GPU AT2 gcc 13.3 hip 6.2 | AMD | MI250/MI210 | AMD_GFX90A | N/A | caraway | RHEL9 | `gcc` 13.3.0/`hipcc` 6.2.41133-0 | ### The Flow of the CI Workflow @@ -24,7 +26,8 @@ As of now, the image is of a UBI 8 system, with Spack-installed compilers and al #### Triggering the Testing Workflow -This autotesting workflow is triggered by opening a pull request to `main` and also by a handful of actions on such a PR that is already open, including: +This autotesting workflow is triggered by opening a pull request to `main` and +also by a handful of actions on such a PR that is already open, including: - `reopened` - `ready_for_review` @@ -40,8 +43,8 @@ or > **Actions** -> `` -> **Re-run `[all,this]` job(s)**. -The AT2 configuration on `blake` currently attempts to keep 3 runners available -to accept jobs at all times. +The AT2 configuration on `blake` and `caraway` currently attempts to keep 3 +runners per machine available to accept jobs at all times. This workflow is configured to allow concurrent testing, so up to 3 test-matrix configurations can run at once. 
The concurrency setting is also configured to kill any active job if another @@ -58,13 +61,17 @@ instance of this workflow is started for the same PR ref. ## Development Details -Most of the required configuration is provided by the AT2 docs and instructional Confluence page (on the Sandia network :confused:--reach out if you need access). +Most of the required configuration is provided by the AT2 docs and instructional +Confluence page (on the Sandia network :confused:--reach out if you need access). However, some non-obvious choices and configurations are listed here. -- To add some info to the testing output, we employ a custom action, cribbed from E3SM/EAMxx, that prints out the workflow's trigger. +- To add some info to the testing output, we employ a custom action, cribbed +from E3SM/EAMxx, that prints out the workflow's trigger. ### Hacks +- [ ] FIXME(@mjs): The hack described below should no longer be necessary after the changes to the Haero build; `build-haero.sh` should now be functional for this build. + - For whatever reason, Skywalker does not like building in the `gcc_12-3-0_cuda_12-1` container for the H100 GPU. - This appears to be an issue of the (Haero?) build not auto-detecting the correct Compute Capability (CC 9.0 => `sm_90`). - To overcome this, we first obtain the CC flag via `nvidia-smi` within the testing container. @@ -77,4 +84,4 @@ However, some non-obvious choices and configurations are listed here. - One token used to fetch and read/write runner information. - **Expires 11 April 2026** - One token used fetch and read repository information via the API. 
- - **Expires 2 May 2025** + - **Expires 6 May 2026** diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 1a267198d..955a0cdff 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -12,17 +12,18 @@ To do this, testing is initialized via the top-level workflow, `MAM4xx Autoteste #### GPU-based Testing -| Test Name | GPU Brand | GPU Type | Microarchitecture | Compute Capability | Machine | Compilers | -| --------------------------------- | --------- | -------- | ---------------- | ------------------ | ------- | ---------------------------- | -| GPU AT2 gcc 12.3 cuda 12.1 | NVIDIA | H100 | Hopper | 9.0 | blake | `gcc` 12.3.0/`nvcc` 12.1.105 | +| Test Name | GPU Brand | GPU Type | Microarchitecture | Compute Capability | Machine | OS | Compilers | +| --------------------------------- | --------- | ------------| ----------------- | ------------------ | ------- | ------ | -------------------------------- | +| GPU AT2 gcc 12.3 cuda 12.1 | NVIDIA | H100 | Hopper | 9.0 | blake | RHEL8 | `gcc` 12.3.0/`nvcc` 12.1.105 | +| GPU AT2 gcc 13.3 hip 6.2 | AMD | MI250/MI210 | AMD_GFX90A | N/A | caraway | RHEL9 | `gcc` 13.3.0/`hipcc` 6.2.41133-0 | #### CPU-based Testing -**Note:** These are the current specs for GitHub's Ubuntu 22.04 runner and are subject to change. +**Note:** These are the *current* specs for GitHub's Ubuntu 22.04 runner and are subject to change. 
-| Test Name | OS | Machine | Compiler | -| -------------------------------------------- | -------------------- | -------------- | ---------- | -| GitHub CPU Auto-test Ubuntu 22.04[^gh-ubu2204] | Linux - Ubuntu 22.04 | GitHub Runners | `gcc` 12.3 | +| Test Name | OS | Machine | Compiler | +| --------------------------------------- | -------------------- | -------------- | ---------- | +| CPU GH-runner Ubuntu 22.04[^gh-ubu2204] | Linux - Ubuntu 22.04 | GitHub Runners | `gcc` 12.3 | ### The Flow of the CI Workflow @@ -48,6 +49,13 @@ Based on the trigger and/or inputs, `MAM4xx Autotester` dispatches sub-workflows - ***Note:*** AT2 = "Autotester 2," the second generation of a Sandia-developed GitHub-based testing product. - See the [AT2 README](./AT2-README.md) for details about the implementation of the AT2 product. +#### GPU AT2 `gcc` 13.3 `hip` 6.2 + +- This is largely identical to the above CUDA-based workflow, the salient difference being that we run on AMD hardware, using the `hipcc` C++ compiler. +- The `caraway` machine provides two MI200-series (AMD_GFX90A-architecture) GPU models: MI210 and MI250. +- At the time of writing, each autotesting job is assigned to whichever of the two GPU models is available, to reduce wait times. + - ***Note:*** This could change based on future needs. + #### GitHub CPU Auto-test Ubuntu 22.04 - The full version of this test runs a "matrix-strategy" test running all combinations of @@ -86,6 +94,7 @@ The current options when manually triggering a workflow are: - Test Machine Architecture - Current Options: - `GPU-NVIDIA_H100` + - `GPU-AMD_MI200-series` - `CPU-Ubuntu_22-04` - `ALL` - Floating-point Precision @@ -135,7 +144,7 @@ Refer to the section on [Other Types of Job Control](./AT2-README.md#other-types - [x] Unify all CI into a single top-level yaml file that calls the sub-cases. - This should provide finer control over what runs and when. - @mjschmidt271 -- [ ] Add testing for AMD GPUs on `caraway`. 
+- [x] Add testing for AMD GPUs on `caraway`. - @jaelynlitz - WIP ### Low-priority