Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions .github/workflows/ghci-snl-testing.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
name: ghci-snl-testing

on:
  # Pull requests that target master
  pull_request:
    branches:
      - master
    types:
      - opened
      - synchronize
      - ready_for_review
      - reopened

  # Manual trigger (used to bless); the caller chooses which job(s) to run
  workflow_dispatch:
    inputs:
      job_to_run:
        description: 'Job to run'
        required: true
        type: choice
        options:
          - gcc-openmp
          - gcc-cuda
          - all

  # Nightly run: 7 AM UTC, which is midnight MT during Standard Time
  schedule:
    - cron: '0 7 * * *'

concurrency:
  # Runs that test the same git ref belong to the same group:
  #  - for pull_request triggers, the ref is refs/pull/<PR_NUMBER>/merge
  #  - for every other trigger, the ref is the branch being tested
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# WARNING: if the set of machines where this workflow can run changes, the location of
#          the certificate file may need to be adjusted (possibly on a per-job basis).
#          For now all ghci-snl-XYZ machines have the same certificates, so the variable
#          can be set once at the workflow level.
env:
  NODE_EXTRA_CA_CERTS: /etc/pki/tls/certs/ca-bundle.crt

jobs:
gcc-openmp:
  # Run for every non-manual trigger; for a manual dispatch, run only if this job
  # (or 'all') was selected.
  # NOTE: the expression is intentionally NOT wrapped in ${{ }}. Inside a block scalar
  # the trailing newline ends up outside the closing braces, so the value becomes a
  # non-empty string, which is always truthy: the job would then never be skipped.
  if: |
    github.event_name != 'workflow_dispatch' ||
    (
      github.event.inputs.job_to_run == 'gcc-openmp' ||
      github.event.inputs.job_to_run == 'all'
    )
  runs-on: [self-hosted, ghci-snl-cpu, gcc]
  strategy:
    # Let the other build type finish even if one of them fails
    fail-fast: false
    matrix:
      build_type: [debug, release]
  name: gcc-openmp / ${{ matrix.build_type }}
  steps:
    # Purely informational: log which event/action/actor started this run
    - name: Show action trigger
      uses: actions/github-script@v7
      with:
        script: |
          const eventName = context.eventName;
          const actor = context.actor || 'unknown'; // Default to 'unknown' if actor is not defined
          let eventAction = 'N/A';

          // Determine the event action based on the event type
          if (eventName === 'pull_request') {
            eventAction = context.payload.action || 'N/A';
          } else if (eventName === 'pull_request_review') {
            eventAction = context.payload.review.state || 'N/A';
          } else if (eventName === 'workflow_dispatch') {
            eventAction = 'manual trigger';
          } else if (eventName === 'schedule') {
            eventAction = 'scheduled trigger';
          }
          console.log(`The job was triggered by a ${eventName} event.`);
          console.log(`  - Event action: ${eventAction}`);
          console.log(`  - Triggered by: ${actor}`);
    - name: Check out the repository
      uses: actions/checkout@v4
      with:
        persist-credentials: false
        show-progress: false
        submodules: recursive
    - name: Ensure CACTS is installed
      run: |
        python3 -m pip install --user --upgrade --trusted-host pypi.org cacts
    # Build and test the selected build type using the ghci-snl-cpu machine entry of cacts.yaml
    - name: Run tests
      run: |
        cmd="cacts -m ghci-snl-cpu -t ${{ matrix.build_type }} -r ./"
        echo "CACTS call: $cmd"
        $cmd
    # Always upload the logs, so that failed runs can be inspected
    - name: Upload files
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: log-files-${{ matrix.build_type }}-ghci-snl-cpu
        path: |
          ctest-build/*/Testing/Temporary/Last*.log
          ctest-build/*/ctest_resource_file.json
          ctest-build/*/ctest_script.cmake
          ctest-build/*/CMakeCache.txt
gcc-cuda:
  # Run for every non-manual trigger; for a manual dispatch, run only if this job
  # (or 'all') was selected.
  # NOTE: the expression is intentionally NOT wrapped in ${{ }}. Inside a block scalar
  # the trailing newline ends up outside the closing braces, so the value becomes a
  # non-empty string, which is always truthy: the job would then never be skipped.
  if: |
    github.event_name != 'workflow_dispatch' ||
    (
      github.event.inputs.job_to_run == 'gcc-cuda' ||
      github.event.inputs.job_to_run == 'all'
    )
  runs-on: [self-hosted, ghci-snl-cuda, cuda, gcc]
  strategy:
    # Let the other build type finish even if one of them fails
    fail-fast: false
    matrix:
      build_type: [debug, release]
  name: gcc-cuda / ${{ matrix.build_type }}
  steps:
    # Purely informational: log which event/action/actor started this run
    - name: Show action trigger
      uses: actions/github-script@v7
      with:
        script: |
          const eventName = context.eventName;
          const actor = context.actor || 'unknown'; // Default to 'unknown' if actor is not defined
          let eventAction = 'N/A';

          // Determine the event action based on the event type
          if (eventName === 'pull_request') {
            eventAction = context.payload.action || 'N/A';
          } else if (eventName === 'pull_request_review') {
            eventAction = context.payload.review.state || 'N/A';
          } else if (eventName === 'workflow_dispatch') {
            eventAction = 'manual trigger';
          } else if (eventName === 'schedule') {
            eventAction = 'scheduled trigger';
          }
          console.log(`The job was triggered by a ${eventName} event.`);
          console.log(`  - Event action: ${eventAction}`);
          console.log(`  - Triggered by: ${actor}`);
    - name: Check out the repository
      uses: actions/checkout@v4
      with:
        persist-credentials: false
        show-progress: false
        submodules: recursive
    # Map the GPU model reported by nvidia-smi to a Kokkos arch name, and export it
    # as ARCH for the following steps
    - name: Get CUDA Arch
      run: |
        # Ensure nvidia-smi is available
        if ! command -v nvidia-smi &> /dev/null; then
            echo "nvidia-smi could not be found. Please ensure you have Nvidia drivers installed."
            exit 1
        fi

        # Get the GPU model from nvidia-smi (first GPU only), and set env for next step
        gpu_model=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)
        case "$gpu_model" in
            *"H100"*)
                echo "ARCH=HOPPER90" >> "$GITHUB_ENV"
                ;;
            *"A100"*)
                echo "ARCH=AMPERE80" >> "$GITHUB_ENV"
                ;;
            *"V100"*)
                echo "ARCH=VOLTA70" >> "$GITHUB_ENV"
                ;;
            *)
                echo "Unsupported GPU model: $gpu_model"
                exit 1
                ;;
        esac
    - name: Ensure CACTS is installed
      run: |
        python3 -m pip install --user --upgrade --trusted-host pypi.org cacts
    # Build and test the selected build type using the ghci-snl-cuda machine entry of
    # cacts.yaml, enabling the Kokkos arch detected above
    - name: Run tests
      run: |
        cmd="cacts -m ghci-snl-cuda -t ${{ matrix.build_type }} -r ./ -c Kokkos_ARCH_${{ env.ARCH }}=ON"
        echo "CACTS call: $cmd"
        $cmd
    # Always upload the logs, so that failed runs can be inspected
    - name: Upload files
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: log-files-${{ matrix.build_type }}-ghci-snl-cuda
        path: |
          ctest-build/*/Testing/Temporary/Last*.log
          ctest-build/*/ctest_resource_file.json
          ctest-build/*/ctest_script.cmake
          ctest-build/*/CMakeCache.txt
114 changes: 114 additions & 0 deletions cacts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Configuration file for CACTS
#
# There are three main sections: project, machines, and configurations
# - project: contains basic info on the project
# - machines: contains a list of machines on which testing is allowed
# - configurations: contains a list of build types that can be built
#
# The machines and configurations sections CAN contain an entry "default", which
# defines some defaults for all machines/build_types. Other entries will OVERWRITE anything
# that is also set in the default entry. It is recommended to keep the default
# entry, since it can be used to list ALL possible settings, for documentation purposes.
#
# Upon parsing the yaml file, CACTS will create one Project, one Machine, and one or
# more BuildType objects. These objects will contain members with *the same* name as the
# configs in the yaml file. Notice the settings names are hard-coded, so you can't add
# a new setting and hope that it gets set in the object.
#
# The objects settings CAN be used in the yaml file to programmatically set other options.
# For instance, a build type can use properties of the project/machine to set a cmake var.
# The syntax is ${<obj>.<prop>}, where <obj> is 'project', 'machine', or 'build', and
# <prop> must be a valid attribute of the corresponding object (see the
# corresponding py files for valid options). If you use the ${..} syntax,
# we recommend that you wrap the entry in quotes, to avoid any surprise with YAML parsers.
# The ${..} syntax is actually more powerful than that, and can perform any python operation,
# with some restrictions (e.g., imports or tinkering with global vars are prohibited,
# for security purposes).
#
# In addition to the ${..} syntax, CACTS also supports bash commands evaluation,
# with the syntax $(cmd). This can be used in conjunction with ${}. E.g., one can do
# NetCDF_Fortran_ROOT: $(${machine.env_setup} && nf-config --prefix)
# Python expressions ${..} are always evaluated first, bash expressions $(..) are
# evaluated afterwards.

##########################################################################################
# PROJECT SETTINGS #
##########################################################################################

project:
  # NOTE: at runtime CACTS also sets project.root_dir, so ${project.root_dir}
  #       can be used in the machines/configurations sections
  name: EKAT

##########################################################################################
# MACHINES #
##########################################################################################

machines:
  # CACTS also sets machine.name for each entry, using the name of the yaml map section

  # Settings shared by all machines; the entries below overwrite what they redefine
  default:
    cxx_compiler: mpicxx
    c_compiler: mpicc
    ftn_compiler: mpifort
    mach_file: "${str(project.root_dir) + '/cmake/machine-files/' + machine.name + '.cmake'}"
    gpu_arch: null
    batch: null
    num_bld_res: null
    num_run_res: null
    valg_supp_file: null
    node_regex: null

  mappy:
    env_setup:
      - "module purge"
      - "module load sems-cmake/3.27.9 sems-git/2.42.0 sems-gcc/11.4.0 sems-openmpi-no-cuda/4.1.6"
    valg_supp_file: "${project.root_dir}/scripts/jenkins/valgrind/mappy.supp"

  weaver:
    env_setup:
      - "source /etc/profile.d/modules.sh"
      - "module purge"
      - "module load cmake/3.25.1 git/2.39.1 python/3.10.8 gcc/11.3.0 cuda/11.8.0 openmpi"

    batch: "bsub -I -q rhel8 -n 4 -gpu num=4"
    num_run_res: 4 # four gpus
    gpu_arch: "cuda"

  # No overrides: the defaults above are used as they are
  ghci-snl-cpu: {}

  ghci-snl-cuda:
    gpu_arch: "cuda"
    num_run_res: 2

#########################################################################################
# BUILDS CONFIGURATIONS #
#########################################################################################

configurations:
  # CACTS will also set an entry build.name, where the value of name matches the yaml map section name

  # Settings shared by all build types; the entries below overwrite what they redefine
  default:
    longname: null # If not set, will default to build.name
    description: null
    uses_baselines: False
    on_by_default: True
    cmake_args:
      EKAT_ENABLE_ALL_PACKAGES: True
      # ${..} expressions are wrapped in quotes, to avoid any surprise with YAML parsers
      EKAT_TEST_THREAD_INC: "${2 if machine.gpu_arch is None else 1}"
      EKAT_TEST_MAX_THREADS: "${machine.num_run_res if machine.gpu_arch is None else 1}"
      EKAT_DISABLE_TPL_WARNINGS: True
      EKAT_DEFAULT_BFB: True
      # Both double and single precision are tested in the same build, since that is simpler.
      # These settings can be overridden when running locally,
      # e.g. via -c EKAT_TEST_DOUBLE_PRECISION=OFF
      EKAT_TEST_DOUBLE_PRECISION: True
      EKAT_TEST_SINGLE_PRECISION: True
      EKAT_SKIP_FIND_YAML_CPP: True

  debug:
    longname: debug
    description: "debug build with both double and single precision testing"
    cmake_args:
      CMAKE_BUILD_TYPE: Debug
      Kokkos_ENABLE_DEBUG_BOUNDS_CHECK: True

  release:
    longname: release
    description: "release build with both double and single precision testing"
    cmake_args:
      CMAKE_BUILD_TYPE: Release
5 changes: 5 additions & 0 deletions cmake/machine-files/ghci-snl-cpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Load the settings shared by all our ghci images
include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake)

# Each ghci-snl-cpu instance currently has 32 cores, but 4 ranks is enough
set(EKAT_TEST_MAX_RANKS
    4
    CACHE STRING "Upper limit on ranks for mpi tests"
)
10 changes: 10 additions & 0 deletions cmake/machine-files/ghci-snl-cuda.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Common settings for our ghci images.
# This also caches EKAT_MPIRUN_EXE and EKAT_MPI_NP_FLAG ("-n"), so the MPI launcher
# settings do not need to be repeated in this file.
include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake)

# Enable CUDA in kokkos
include(${CMAKE_CURRENT_LIST_DIR}/kokkos/cuda.cmake)

# Currently, we have 2 GPUs/node on Blake, and we run a SINGLE build per node, so we can fit 2 ranks there
set(EKAT_TEST_MAX_RANKS 2 CACHE STRING "Upper limit on ranks for mpi tests")
9 changes: 9 additions & 0 deletions cmake/machine-files/ghci-snl.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Settings shared by the ghci-snl machine files.

# Let's catch usage of code deprecated in Kokkos 4
option(Kokkos_ENABLE_DEPRECATED_CODE_4
       "Enable Kokkos 4 deprecated code (OFF, so that usage of deprecated code is caught)" OFF)

# We need to manage resources to spread across available cores/gpus
option(EKAT_TEST_LAUNCHER_MANAGE_RESOURCES
       "Let the test launcher manage resources, to spread tests across available cores/gpus" ON)

# Needed by EkatCreateUnitTest
set(EKAT_MPIRUN_EXE "mpirun" CACHE STRING "The executable used to launch mpi tests")
set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks")
31 changes: 0 additions & 31 deletions scripts/jenkins/jenkins.sh

This file was deleted.

Loading