-
Notifications
You must be signed in to change notification settings - Fork 9
Enable testing with CACTS, and add gh actions to use it #374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
5987c28
Add ghci-snl* machine files
bartgol 93b467e
Add cacts config file
bartgol f678a0b
Replace jenkins scripts with gh workflows
bartgol b9e7299
Fix certificate issue in ghci-snl-testing
bartgol 952211d
Fix mach files options copied from eamxx repo
bartgol File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
name: ghci-snl-testing | ||
|
||
on: | ||
# Runs on PRs against master | ||
pull_request: | ||
branches: [ master ] | ||
types: [opened, synchronize, ready_for_review, reopened] | ||
|
||
# Manual run is used to bless | ||
workflow_dispatch: | ||
inputs: | ||
job_to_run: | ||
description: 'Job to run' | ||
required: true | ||
type: choice | ||
options: | ||
- gcc-openmp | ||
- gcc-cuda | ||
- all | ||
|
||
# Add schedule trigger for nightly runs at midnight MT (Standard Time) | ||
schedule: | ||
- cron: '0 7 * * *' # Runs at 7 AM UTC, which is midnight MT during Standard Time | ||
|
||
concurrency: | ||
# Two runs are in the same group if they are testing the same git ref | ||
# - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge | ||
# - for other triggers, the ref is the branch tested | ||
group: ${{ github.workflow }}-${{ github.ref }} | ||
cancel-in-progress: true | ||
|
||
# WARNING: if you change machines where this workflow can run, you may have to adjust the | ||
# location of the certificate file (possibly on a per-job basis). For now, since | ||
# all ghci-snl-XYZ machines have the same certificates, this can be set at | ||
# the workflow level | ||
env: | ||
NODE_EXTRA_CA_CERTS: /etc/pki/tls/certs/ca-bundle.crt | ||
|
||
jobs: | ||
gcc-openmp: | ||
# Bare expression on purpose (no ${{ }} wrapper): inside a block scalar the wrapper | ||
# produces a string with a trailing newline, which is always truthy, so the job would | ||
# run even when a manual dispatch selected a different job. | ||
if: | ||
github.event_name != 'workflow_dispatch' || | ||
( | ||
github.event.inputs.job_to_run == 'gcc-openmp' || | ||
github.event.inputs.job_to_run == 'all' | ||
) | ||
runs-on: [self-hosted, ghci-snl-cpu, gcc] | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
build_type: [debug, release] | ||
name: gcc-openmp / ${{ matrix.build_type }} | ||
steps: | ||
- name: Show action trigger | ||
uses: actions/github-script@v7 | ||
with: | ||
script: | | ||
const eventName = context.eventName; | ||
const actor = context.actor || 'unknown'; // Default to 'unknown' if actor is not defined | ||
let eventAction = 'N/A'; | ||
|
||
// Determine the event action based on the event type | ||
if (eventName === 'pull_request') { | ||
eventAction = context.payload.action || 'N/A'; | ||
} else if (eventName === 'pull_request_review') { | ||
eventAction = context.payload.review.state || 'N/A'; | ||
} else if (eventName === 'workflow_dispatch') { | ||
eventAction = 'manual trigger'; | ||
} else if (eventName === 'schedule') { | ||
eventAction = 'scheduled trigger'; | ||
} | ||
console.log(`The job was triggered by a ${eventName} event.`); | ||
console.log(` - Event action: ${eventAction}`); | ||
console.log(` - Triggered by: ${actor}`); | ||
- name: Check out the repository | ||
uses: actions/checkout@v4 | ||
with: | ||
persist-credentials: false | ||
show-progress: false | ||
submodules: recursive | ||
- name: Ensure CACTS is installed | ||
run: | | ||
python3 -m pip install --user --upgrade --trusted-host pypi.org cacts | ||
- name: Run tests | ||
run: | | ||
cmd="cacts -m ghci-snl-cpu -t ${{ matrix.build_type }} -r ./" | ||
echo "CACTS call: $cmd" | ||
$cmd | ||
- name: Upload files | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: log-files-${{ matrix.build_type }}-ghci-snl-cpu | ||
path: | | ||
ctest-build/*/Testing/Temporary/Last*.log | ||
ctest-build/*/ctest_resource_file.json | ||
ctest-build/*/ctest_script.cmake | ||
ctest-build/*/CMakeCache.txt | ||
gcc-cuda: | ||
# Bare expression on purpose (no ${{ }} wrapper): inside a block scalar the wrapper | ||
# produces a string with a trailing newline, which is always truthy, so the job would | ||
# run even when a manual dispatch selected a different job. | ||
if: | ||
github.event_name != 'workflow_dispatch' || | ||
( | ||
github.event.inputs.job_to_run == 'gcc-cuda' || | ||
github.event.inputs.job_to_run == 'all' | ||
) | ||
runs-on: [self-hosted, ghci-snl-cuda, cuda, gcc] | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
build_type: [debug, release] | ||
name: gcc-cuda / ${{ matrix.build_type }} | ||
steps: | ||
- name: Show action trigger | ||
uses: actions/github-script@v7 | ||
with: | ||
script: | | ||
const eventName = context.eventName; | ||
const actor = context.actor || 'unknown'; // Default to 'unknown' if actor is not defined | ||
let eventAction = 'N/A'; | ||
|
||
// Determine the event action based on the event type | ||
if (eventName === 'pull_request') { | ||
eventAction = context.payload.action || 'N/A'; | ||
} else if (eventName === 'pull_request_review') { | ||
eventAction = context.payload.review.state || 'N/A'; | ||
} else if (eventName === 'workflow_dispatch') { | ||
eventAction = 'manual trigger'; | ||
} else if (eventName === 'schedule') { | ||
eventAction = 'scheduled trigger'; | ||
} | ||
console.log(`The job was triggered by a ${eventName} event.`); | ||
console.log(` - Event action: ${eventAction}`); | ||
console.log(` - Triggered by: ${actor}`); | ||
- name: Check out the repository | ||
uses: actions/checkout@v4 | ||
with: | ||
persist-credentials: false | ||
show-progress: false | ||
submodules: recursive | ||
- name: Get CUDA Arch | ||
run: | | ||
# Ensure nvidia-smi is available | ||
if ! command -v nvidia-smi &> /dev/null; then | ||
echo "nvidia-smi could not be found. Please ensure you have Nvidia drivers installed." | ||
exit 1 | ||
fi | ||
|
||
# Get the GPU model from nvidia-smi, and set env for next step | ||
gpu_model=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1) | ||
case "$gpu_model" in | ||
*"H100"*) | ||
echo "ARCH=HOPPER90" >> $GITHUB_ENV | ||
;; | ||
*"A100"*) | ||
echo "ARCH=AMPERE80" >> $GITHUB_ENV | ||
;; | ||
*"V100"*) | ||
echo "ARCH=VOLTA70" >> $GITHUB_ENV | ||
;; | ||
*) | ||
echo "Unsupported GPU model: $gpu_model" | ||
exit 1 | ||
;; | ||
esac | ||
- name: Ensure CACTS is installed | ||
run: | | ||
python3 -m pip install --user --upgrade --trusted-host pypi.org cacts | ||
- name: Run tests | ||
run: | | ||
cmd="cacts -m ghci-snl-cuda -t ${{ matrix.build_type }} -r ./ -c Kokkos_ARCH_${{ env.ARCH }}=ON" | ||
echo "CACTS call: $cmd" | ||
$cmd | ||
- name: Upload files | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: log-files-${{ matrix.build_type }}-ghci-snl-cuda | ||
path: | | ||
ctest-build/*/Testing/Temporary/Last*.log | ||
ctest-build/*/ctest_resource_file.json | ||
ctest-build/*/ctest_script.cmake | ||
ctest-build/*/CMakeCache.txt |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Configuration file for CACTS | ||
# | ||
# There are three main sections: project, machines, configurations | ||
# - project: contains basic info on the project | ||
# - machines: contains a list of machines on which testing is allowed | ||
# - configurations: contains a list of build types that can be built | ||
# | ||
# The machines and configurations sections CAN contain an entry "default", which | ||
# defines some defaults for all machines/build_types. Other entries will OVERWRITE anything | ||
# that is also set in the default entry. It is recommended to keep the default | ||
# entry, since it can be used to list ALL possible settings, for documentation purposes. | ||
# | ||
# Upon parsing the yaml file, CACTS will create one Project, one Machine, and one or | ||
# more BuildType objects. These objects will contain members with *the same* name as the | ||
# configs in the yaml file. Notice the settings names are hard-coded, so you can't add | ||
# a new setting and hope that it gets set in the object. | ||
# | ||
# The objects settings CAN be used in the yaml file to programmatically set other options. | ||
# For instance, a build type can use properties of the project/machine to set a cmake var. | ||
# The syntax is ${<obj>.<prop>}, where <obj> is 'project', 'machine', or 'build', | ||
# and <prop> must be a valid attribute of the corresponding object (see the | ||
# corresponding py files for valid options). If you use the ${..} syntax, | ||
# we recommend that you wrap the entry in quotes, to avoid any surprise with YAML parsers. | ||
# The ${..} syntax is actually more powerful than that, and can perform any python operation, | ||
# with some restrictions (e.g., imports or tinkering with global vars are prohibited | ||
# for security purposes). | ||
# | ||
# In addition to the ${..} syntax, CACTS also supports bash commands evaluation, | ||
# with the syntax $(cmd). This can be used in conjunction with ${}. E.g., one can do | ||
# NetCDF_Fortran_ROOT: $(${machine.env_setup} && nf-config --prefix) | ||
# Python expressions ${..} are always evaluated first, bash expressions $(..) are | ||
# evaluated afterwards. | ||
|
||
########################################################################################## | ||
# PROJECT SETTINGS # | ||
########################################################################################## | ||
|
||
project: | ||
name: EKAT | ||
# NOTE: CACTS will also set project.root_dir at runtime, so you can actually use | ||
# ${project.root_dir} in the machines/configurations sections | ||
|
||
########################################################################################## | ||
# MACHINES # | ||
########################################################################################## | ||
|
||
machines: | ||
# CACTS will also set an entry machine.name, where the value of name matches the yaml map section name | ||
default: | ||
cxx_compiler: mpicxx | ||
c_compiler: mpicc | ||
ftn_compiler: mpifort | ||
mach_file: "${str(project.root_dir) + '/cmake/machine-files/' + machine.name + '.cmake'}" | ||
gpu_arch: null | ||
batch: null | ||
num_bld_res: null | ||
num_run_res: null | ||
valg_supp_file: null | ||
node_regex: null | ||
|
||
mappy: | ||
env_setup: | ||
- 'module purge' | ||
- 'module load sems-cmake/3.27.9 sems-git/2.42.0 sems-gcc/11.4.0 sems-openmpi-no-cuda/4.1.6' | ||
valg_supp_file: "${project.root_dir}/scripts/jenkins/valgrind/mappy.supp" | ||
|
||
weaver: | ||
env_setup: | ||
- "source /etc/profile.d/modules.sh" | ||
- "module purge" | ||
- "module load cmake/3.25.1 git/2.39.1 python/3.10.8 gcc/11.3.0 cuda/11.8.0 openmpi" | ||
|
||
batch: "bsub -I -q rhel8 -n 4 -gpu num=4" | ||
num_run_res: 4 # four gpus | ||
gpu_arch: "cuda" | ||
|
||
ghci-snl-cpu: {} | ||
|
||
ghci-snl-cuda: | ||
gpu_arch: "cuda" | ||
num_run_res: 2 | ||
|
||
######################################################################################### | ||
# BUILDS CONFIGURATIONS # | ||
######################################################################################### | ||
|
||
configurations: | ||
# CACTS will also set an entry build.name, where the value of name matches the yaml map section name | ||
default: | ||
longname: null # If not set, will default to build.name | ||
description: null | ||
uses_baselines: False | ||
on_by_default: True | ||
cmake_args: | ||
EKAT_ENABLE_ALL_PACKAGES: True | ||
# ${..} expressions are quoted so every YAML parser reads them as plain strings | ||
EKAT_TEST_THREAD_INC: "${2 if machine.gpu_arch is None else 1}" | ||
EKAT_TEST_MAX_THREADS: "${machine.num_run_res if machine.gpu_arch is None else 1}" | ||
EKAT_DISABLE_TPL_WARNINGS: True | ||
EKAT_DEFAULT_BFB: True | ||
EKAT_TEST_DOUBLE_PRECISION: True | ||
EKAT_TEST_SINGLE_PRECISION: True | ||
EKAT_SKIP_FIND_YAML_CPP: True | ||
|
||
debug: | ||
longname: debug | ||
description: "debug build with both double and single precision testing" | ||
cmake_args: | ||
CMAKE_BUILD_TYPE: Debug | ||
Kokkos_ENABLE_DEBUG_BOUNDS_CHECK: True | ||
release: | ||
longname: release | ||
description: "release build with both double and single precision testing" | ||
cmake_args: | ||
CMAKE_BUILD_TYPE: Release |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Common settings for our ghci images | ||
include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake) | ||
|
||
# Currently, we have 32 cores for each ghci-snl-cpu instance, but 4 ranks is enough | ||
set(EKAT_TEST_MAX_RANKS 4 CACHE STRING "Upper limit on ranks for mpi tests") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Common settings for our ghci images | ||
include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake) | ||
|
||
# Enable CUDA in kokkos | ||
include (${CMAKE_CURRENT_LIST_DIR}/kokkos/cuda.cmake) | ||
|
||
set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks") | ||
|
||
# Currently, we have 2 GPUs/node on Blake, and we run a SINGLE build per node, so we can fit 2 ranks there | ||
set(EKAT_TEST_MAX_RANKS 2 CACHE STRING "Upper limit on ranks for mpi tests") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Let's catch usage of code deprecated in Kokkos 4 | ||
option (Kokkos_ENABLE_DEPRECATED_CODE_4 "" OFF) | ||
|
||
# We need to manage resources to spread across available cores/gpus | ||
option (EKAT_TEST_LAUNCHER_MANAGE_RESOURCES "" ON) | ||
|
||
# Needed by EkatCreateUnitTest | ||
set (EKAT_MPIRUN_EXE "mpirun" CACHE STRING "") | ||
set (EKAT_MPI_NP_FLAG "-n" CACHE STRING "") |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I decided that testing both SP and DP in the same build was just simpler. Users can of course override these settings if running locally via
-c EKAT_TEST_DOUBLE_PRECISION=OFF
.