diff --git a/.gitmodules b/.gitmodules
index 6cb4b45..5a42503 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,9 @@
 [submodule "2025-HPCIC/tutorial-code/thicket-tutorial"]
 	path = 2025-HPCIC/tutorial-code/thicket-tutorial
 	url = https://github.com/llnl/thicket-tutorial
+[submodule "2025-eScience/tutorial-code/thicket-tutorial"]
+	path = 2025-eScience/tutorial-code/thicket-tutorial
+	url = https://github.com/llnl/thicket-tutorial
+[submodule "2025-eScience/tutorial-code/caliper-tutorial"]
+	path = 2025-eScience/tutorial-code/caliper-tutorial
+	url = https://github.com/daboehme/caliper-tutorial.git
diff --git a/2025-eScience/README.rst b/2025-eScience/README.rst
new file mode 100644
index 0000000..52ab8e8
--- /dev/null
+++ b/2025-eScience/README.rst
@@ -0,0 +1,139 @@
+======================
+eScience 2025 Tutorial
+======================
+
+This directory contains the materials for the eScience 2025 tutorial. The following subsections describe the contents of the material.
+
+--------
+Contents
+--------
+
+^^^^^^^^^^^^^
+Tutorial Code
+^^^^^^^^^^^^^
+
+The code elements of this tutorial (e.g., Jupyter notebooks, command-line scripts, Markdown/RST instruction files) can all be found in the :code:`tutorial-code` subdirectory. If materials are actually stored in other git repositories, they can be accessed from this subdirectory
+via a git submodule.
+
+^^^^^^
+Slides
+^^^^^^
+
+The slides used in presenting this tutorial can be found in the :code:`slides` subdirectory.
+
+^^^^^^
+Docker
+^^^^^^
+
+The Docker definition files (i.e., Dockerfiles) for all the necessary containers can be found in the :code:`docker` subdirectory. There are currently 5 definition files:
+
+1. :code:`Dockerfile.caliper`: builds Caliper and Adiak on top of the :code:`ubuntu/noble` image from DockerHub
+2. :code:`Dockerfile.thicket`: builds Thicket on top of the image produced by :code:`Dockerfile.caliper`
+3. :code:`Dockerfile.benchpark`: downloads and bootstraps Benchpark on top of the image produced by :code:`Dockerfile.thicket`
+4. :code:`Dockerfile.spawn`: downloads tutorial materials, downloads any remaining necessary packages, and does other setup work on top of the image produced by :code:`Dockerfile.benchpark`
+5. :code:`Dockerfile.init`: ensures user permissions are correct using the super-minimal :code:`alpine/git` image from DockerHub
+
+"""""""""""""""""""""""""""""""""""""""
+Testing the Builds of the Docker Images
+"""""""""""""""""""""""""""""""""""""""
+
+To enable automated testing of the Docker images, all edits to the Dockerfiles above should be done in a branch with an open PR. When a PR is open, a GitHub Actions CI workflow will
+run and ensure that the images can be built. To properly configure the CI, edit the :code:`github_ci_matrix.json` file in the root of this repository as follows:
+
+1. Edit the "tag" field to be the tag (i.e., version) of the Docker images you will be generating
+2. Edit the "tutorial_dir" field to be the name of this directory
+
+The CI reads :code:`github_ci_matrix.json` to get values shared by the matrices of all GitHub Actions jobs.
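+
+For illustration only (the exact schema is defined by the CI workflow, and the values below are placeholders, not the real configuration), the file might look like:
+
+.. code-block:: json
+
+    {
+        "tag": "escience-2025",
+        "tutorial_dir": "2025-eScience"
+    }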
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Pushing the Docker Images to GitHub Container Registry (GHCR)
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Before trying to push to GHCR, someone with the necessary permissions should make sure this repo can push to these images in GHCR (**change names when we decide on appropriate ones**):
+
+* ghcr.io/llnl/caliper
+* ghcr.io/llnl/thicket
+* ghcr.io/llnl/benchpark
+* ghcr.io/llnl/reproducible-benchmarking-spawn
+* ghcr.io/llnl/reproducible-benchmarking-init
+
+If these images do not yet exist, your first push will properly set the permissions. If these images do exist, follow the instructions
+`here `_
+to add this repository to each package. Make sure to grant "Write" permissions to the repository while doing this.
+
+After ensuring this repository has the necessary permissions, follow these steps to push the Docker images to GHCR:
+
+1. Make sure all changes to the Dockerfiles have been merged into the :code:`main` branch
+2. From the GitHub webpage, navigate to the "Actions" tab
+3. On the left of the resulting page, click on "Build containers and push to GHCR"
+4. Click on the "Run workflow" button to the right of the page
+5. In the popup menu that appears, select the "main" branch and fill out the requested information
+6. Click the green "Run workflow" button to start the process of building and pushing the images
+
+^^^^^^^^^^^^^^
+Infrastructure
+^^^^^^^^^^^^^^
+
+All the infrastructure needed to deploy the tutorial to a Kubernetes cluster with JupyterHub is contained in the :code:`infrastructure` subdirectory.
+This infrastructure is generated by the tool `here `_.
+The infrastructure can be regenerated as-is using :code:`infrastructure/config.toml`.
+
+----------------------------
+Testing the Tutorial Locally
+----------------------------
+
+To test the tutorial locally, you first need to build all the Docker images except the init image. Before building,
+keep in mind the following dependencies between images:
+
+.. code-block::
+
+    ghcr.io/llnl/caliper --> ghcr.io/llnl/thicket --> ghcr.io/llnl/benchpark --> ghcr.io/llnl/reproducible-benchmarking-spawn
+
+Because of these dependencies, the first thing you should figure out is which (if any) images you need to build locally.
+If a Dockerfile has changes that are **not** on GHCR, you will need to build that image *and all downstream images (based on the flowchart above)*
+locally before testing. To build an image locally, run the following from this directory (**not the** :code:`docker` **directory**):
+
+.. code-block:: bash
+
+    $ docker build -t <image_url_and_tag> -f ./docker/<dockerfile_name> .  # Note the trailing "."
+
+In the command above, :code:`<image_url_and_tag>` should be one of the GHCR URLs above, followed by a colon, followed by a tag. It could look something
+like :code:`ghcr.io/llnl/benchpark:escience-2025`. Note that :code:`<image_url_and_tag>` **must** match the value of the :code:`FROM` directive
+for the dependent image. For example, to get the :code:`<image_url_and_tag>` field for :code:`ghcr.io/llnl/benchpark`, look for the :code:`FROM` directive
+in :code:`./docker/Dockerfile.spawn`.
+
+If all the changes to the corresponding Dockerfiles in :code:`docker` have already been pushed to GHCR, you do not need to build locally.
+Instead, you should just pull the spawn image using:
+
+.. code-block:: bash
+
+    $ docker pull ghcr.io/llnl/reproducible-benchmarking-spawn:<tag>
+
+You should replace :code:`<tag>` in the command above with the GHCR tag of the image you want to pull.
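+
+As a concrete sketch of the local-build path (the :code:`escience-2025` tag below is only an example, and it assumes the :code:`FROM` directives of the downstream Dockerfiles reference these same tags), rebuilding the benchpark image and everything downstream of it would look like:
+
+.. code-block:: bash
+
+    # Build the benchpark image with local Dockerfile changes
+    $ docker build -t ghcr.io/llnl/benchpark:escience-2025 -f ./docker/Dockerfile.benchpark .
+    # Rebuild the downstream spawn image on top of the locally built benchpark image
+    $ docker build -t ghcr.io/llnl/reproducible-benchmarking-spawn:escience-2025 -f ./docker/Dockerfile.spawn .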
+
+After you have a built spawn image (either by building locally or by pulling from GHCR), you can run the spawn image locally
+by running the following command:
+
+.. code-block:: bash
+
+    $ docker run --rm -it --entrypoint <entrypoint> --name reproducible_benchmark_tutorial_local -p 8888:8888 <spawn_image_name>
+
+In the command above, :code:`<spawn_image_name>` is the name of the built spawn image. If you built that image locally, this argument
+should match the value you passed to the :code:`-t` flag of :code:`docker build` when building the spawn image. If you pulled the image
+from GHCR, this argument should be :code:`ghcr.io/llnl/reproducible-benchmarking-spawn:<tag>`.
+
+The :code:`<entrypoint>` field in the command above dictates what command runs within the container immediately after startup.
+It can be one of three values:
+
+1. :code:`/local-entrypoint.sh`: this entrypoint script will start a JupyterLab instance and make it available from outside the container.
+2. :code:`/entrypoint.sh`: this entrypoint script will run :code:`jupyterhub-singleuser`. It is intended for use in the cloud JupyterHub deployment and should not be used locally.
+3. :code:`bash`: by specifying :code:`bash` (or any other shell installed in the container), you will get command-line access to the container, instead of a Jupyter environment.
+
+At this point, you should either have a Jupyter URL that you can use to access Jupyter, or you should have shell access to the container.
+You can now do whatever local testing you want of the image.
+
+------------------------------------
+Deploying the Tutorial to Kubernetes
+------------------------------------
+
+TBA
diff --git a/2025-eScience/docker/Dockerfile.benchpark b/2025-eScience/docker/Dockerfile.benchpark
new file mode 100644
index 0000000..817d416
--- /dev/null
+++ b/2025-eScience/docker/Dockerfile.benchpark
@@ -0,0 +1,58 @@
+# Copyright 2025 Lawrence Livermore National Security, LLC and other
+# Benchpark developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# For testing
+# FROM test-thicket
+
+FROM ghcr.io/llnl/thicket:hpcic-2025
+
+USER root
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    wget \
+    gzip \
+    lsb-release \
+    patch \
+    tar \
+    unzip \
+    xz-utils \
+    zstd \
+    bzip2 \
+    liblapack-dev \
+    libblas-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+SHELL [ "/bin/bash", "-c" ]
+
+USER ${NB_USER}
+
+RUN git clone https://github.com/LLNL/benchpark.git ${HOME}/benchpark && \
+    cd ${HOME}/benchpark && \
+    git checkout -b develop-2025-08-25 develop-2025-08-25 && \
+    git submodule update --init --recursive
+
+USER root
+
+RUN . /opt/global_py_venv/bin/activate && \
+    python3 -m pip install -r ${HOME}/benchpark/requirements.txt
+
+RUN echo 'export PATH=${HOME}/benchpark/bin:$PATH' >> ${HOME}/.bashrc
+
+RUN echo 'export PATH=${HOME}/benchpark/bin:$PATH' >> ${HOME}/.bash_profile
+
+RUN chmod -R 777 ~/ ${HOME}
+
+WORKDIR ${HOME}
+
+RUN mkdir -p ${HOME}/.local/share && \
+    chmod 777 ${HOME}/.local/share
+
+USER ${NB_USER}
+
+# Run this to trigger bootstrap
+RUN . /opt/global_py_venv/bin/activate && \
+    ${HOME}/benchpark/bin/benchpark bootstrap
diff --git a/2025-eScience/docker/Dockerfile.caliper b/2025-eScience/docker/Dockerfile.caliper
new file mode 100644
index 0000000..77c7f78
--- /dev/null
+++ b/2025-eScience/docker/Dockerfile.caliper
@@ -0,0 +1,131 @@
+# Copyright 2025 Lawrence Livermore National Security, LLC and other
+# Benchpark developers. See the top-level COPYRIGHT file for details.
+# +# SPDX-License-Identifier: Apache-2.0 + +# FROM ubuntu:noble +FROM fluxrm/flux-sched:jammy + +# ubuntu:noble added a new 'ubuntu' user in the container. +# Get rid of it! +# RUN userdel -r ubuntu + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + adduser \ + vim \ + nano \ + emacs \ + build-essential \ + cmake \ + python3 \ + python3-dev \ + python3-pip \ + python3-venv \ + git \ + util-linux \ + less \ + htop \ + zip \ + unzip \ + # NOTE: the flux-sched image already pulls and builds MPICH 4.2.2 + # WITHOUT PMIx support (this is important because PMIx is a pain, and + # requires extra setup with Flux). + # openmpi-bin \ + # openmpi-common \ + # libopenmpi-dev \ + && rm -rf /var/lib/apt/lists/* + +SHELL [ "/bin/bash", "-c" ] + +RUN python3 -m venv /opt/global_py_venv + +RUN . /opt/global_py_venv/bin/activate && \ + python3 -m pip install pybind11 + +ENV CALI_INSTALL_PREFIX=/usr \ + GIT_CLONE_STAGING_DIR=/tmp + +RUN git clone https://github.com/LLNL/Caliper.git ${GIT_CLONE_STAGING_DIR}/Caliper && \ + cd ${GIT_CLONE_STAGING_DIR}/Caliper && \ + git fetch origin && \ + git checkout v2.12.1 && \ + git submodule update --init --recursive && \ + git clone https://github.com/LLNL/Adiak.git ${GIT_CLONE_STAGING_DIR}/Adiak && \ + cd ${GIT_CLONE_STAGING_DIR}/Adiak && \ + git fetch origin && \ + git checkout v0.4.1 && \ + git submodule update --init --recursive + +RUN cd ${GIT_CLONE_STAGING_DIR}/Adiak && \ + mkdir build && \ + cd build && \ + cmake \ + -DENABLE_MPI=ON \ + -DCMAKE_C_COMPILER=$(which gcc) \ + -DCMAKE_CXX_COMPILER=$(which g++) \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_INSTALL_PREFIX=${CALI_INSTALL_PREFIX} \ + .. && \ + make -j 4 && \ + make install + +RUN . /opt/global_py_venv/bin/activate && \ + cd ${GIT_CLONE_STAGING_DIR}/Caliper && \ + mkdir build && \ + cd build && \ + cmake \ + -DWITH_TOOLS=ON \ + -DWITH_MPI=ON \ + -DWITH_ADIAK=ON \ + -DWITH_PYTHON_BINDINGS=ON \ + -Dpybind11_DIR=$(pybind11-config --cmakedir) \ + -DCMAKE_PREFIX_PATH=${CALI_INSTALL_PREFIX} \ + -DCMAKE_C_COMPILER=$(which gcc) \ + -DCMAKE_CXX_COMPILER=$(which g++) \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_INSTALL_PREFIX=${CALI_INSTALL_PREFIX} \ + .. && \ + make -j 4 && \ + make install + +RUN rm -rf ${GIT_CLONE_STAGING_DIR}/Caliper && rm -rf ${GIT_CLONE_STAGING_DIR}/Adiak + +ENV NB_USER=jovyan \ + NB_UID=1000 \ + HOME=/home/jovyan + +RUN adduser \ + --disabled-password \ + --gecos "Default user" \ + --uid ${NB_UID} \ + --home ${HOME} \ + --force-badname \ + ${NB_USER} + +# NOTE: this should NEVER be uncommented by the time we push to GHCR +RUN adduser ${NB_USER} sudo +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +RUN chmod -R 777 ~/ ${HOME} + +ENV SHELL=/usr/bin/bash + +RUN mkdir -p ${HOME}/.local/share && \ + chmod 777 ${HOME}/.local/share + +RUN echo $(flux env) + +RUN echo 'export PATH=/usr/bin:$PATH' >> ${HOME}/.bashrc && \ + echo '. /opt/global_py_venv/bin/activate' >> ${HOME}/.bashrc && \ + echo 'export LD_LIBRARY_PATH=/usr/lib:/usr/lib64:$LD_LIBRARY_PATH' >> ${HOME}/.bashrc + +RUN echo 'export PATH=/usr/bin:$PATH' >> ${HOME}/.bash_profile && \ + echo '. 
/opt/global_py_venv/bin/activate' >> ${HOME}/.bash_profile && \ + echo 'export LD_LIBRARY_PATH=/usr/lib:/usr/lib64:$LD_LIBRARY_PATH' >> ${HOME}/.bash_profile + +USER ${NB_USER} +WORKDIR ${HOME} \ No newline at end of file diff --git a/2025-eScience/docker/Dockerfile.hub b/2025-eScience/docker/Dockerfile.hub new file mode 100644 index 0000000..32247be --- /dev/null +++ b/2025-eScience/docker/Dockerfile.hub @@ -0,0 +1,3 @@ +FROM jupyterhub/k8s-hub:4.2.0 + +ENV JUPYTERHUB_XSRF_ANONYMOUS_IP_CIDRS="0.0.0.0/0" \ No newline at end of file diff --git a/2025-eScience/docker/Dockerfile.init b/2025-eScience/docker/Dockerfile.init new file mode 100644 index 0000000..269a316 --- /dev/null +++ b/2025-eScience/docker/Dockerfile.init @@ -0,0 +1,22 @@ +# Copyright 2025 Lawrence Livermore National Security, LLC and other +# Benchpark developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +FROM alpine/git + +USER root + +ENV NB_USER=jovyan \ + NB_UID=1000 \ + HOME=/home/jovyan + +RUN adduser \ + -D \ + -g "Default user" \ + -u ${NB_UID} \ + -h ${HOME} \ + ${NB_USER} + +COPY ./docker/init-entrypoint.sh /entrypoint.sh +RUN chmod 777 /entrypoint.sh diff --git a/2025-eScience/docker/Dockerfile.spawn b/2025-eScience/docker/Dockerfile.spawn new file mode 100644 index 0000000..01b1877 --- /dev/null +++ b/2025-eScience/docker/Dockerfile.spawn @@ -0,0 +1,113 @@ +# Copyright 2025 Lawrence Livermore National Security, LLC and other +# Benchpark developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +# For testing +# FROM test-benchpark + +FROM ghcr.io/llnl/benchpark:hpcic-2025 + +USER root + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + dnsutils \ + iputils-ping \ + tini + +SHELL [ "/bin/bash", "-c" ] + +COPY ./docker/requirements.txt /requirements.txt + +RUN . /opt/global_py_venv/bin/activate && \ + # Needed for some viz in thicket-tutorial + /opt/global_py_venv/bin/python3 -m pip install plotly[express] && \ + /opt/global_py_venv/bin/python3 -m pip install -r /requirements.txt && \ + /opt/global_py_venv/bin/python3 -m pip install -U jupyter ipython ipykernel && \ + /opt/global_py_venv/bin/python3 -m ipykernel install --prefix=/usr/local --name 'benchpark-tutorial-kernel' + +COPY ./docker/kitware-archive.sh /tmp + +RUN /tmp/kitware-archive.sh && \ + apt install -y cmake && \ + which cmake && \ + cmake --version && \ + rm -rf /var/lib/apt/lists/* + +COPY ./tutorial-code/thicket-tutorial/requirements.txt /tmp/thicket-tutorial/requirements.txt + +RUN . /opt/global_py_venv/bin/activate && \ + python3 -m pip install extrap scikit-learn seaborn +# NOTE: we're not installing the dependencies for thicket-tutorial with +# requirements.txt because it hardcodes a version of IPython, which +# breaks all sorts of things. Instead, we extract the relevant dependencies +# and place them above. 
+# python3 -m pip install -r /tmp/thicket-tutorial/requirements.txt + +COPY ./tutorial-code/caliper-tutorial/apps /tmp/caliper-tutorial/apps/ +COPY ./tutorial-code/caliper-tutorial/cmake /tmp/caliper-tutorial/cmake/ + +ENV CALI_TUTORIAL_INSTALL_PREFIX=/usr + +RUN cmake \ + -B /tmp/build-basic-example \ + -C /tmp/caliper-tutorial/cmake/basic_example.cmake \ + -DCMAKE_INSTALL_PREFIX="${CALI_TUTORIAL_INSTALL_PREFIX}" \ + -S /tmp/caliper-tutorial/apps/basic_example && \ + cmake --build "/tmp/build-basic-example" && \ + cmake --install "/tmp/build-basic-example" +# && \ +# rm -rf /tmp/build-basic-example + +RUN cmake \ + -B /tmp/build-lulesh \ + -C /tmp/caliper-tutorial/cmake/lulesh-mpi.cmake \ + -DCMAKE_INSTALL_PREFIX="${CALI_TUTORIAL_INSTALL_PREFIX}" \ + -S /tmp/caliper-tutorial/apps/LULESH && \ + cmake --build "/tmp/build-lulesh" && \ + cmake --install "/tmp/build-lulesh" +# && \ +# rm -rf /tmp/build-lulesh + +RUN cmake \ + -B /tmp/build-xsbench \ + -C /tmp/caliper-tutorial/cmake/xsbench-mpi.cmake \ + -DCMAKE_INSTALL_PREFIX="${CALI_TUTORIAL_INSTALL_PREFIX}" \ + -S /tmp/caliper-tutorial/apps/XSBench && \ + cmake --build "/tmp/build-xsbench" && \ + cmake --install "/tmp/build-xsbench" +# && \ +# rm -rf /tmp/build-xsbench + +COPY ./tutorial-code/caliper-tutorial/tutorial ${HOME}/caliper-tutorial/ +COPY ./tutorial-code/caliper-tutorial/apps ${HOME}/caliper-tutorial/apps +COPY ./tutorial-code/thicket-tutorial/data/lassen ${HOME}/thicket-tutorial/data/lassen +COPY ./tutorial-code/thicket-tutorial/data/quartz ${HOME}/thicket-tutorial/data/quartz +COPY ./tutorial-code/thicket-tutorial/notebooks/01_thicket_tutorial.ipynb ${HOME}/thicket-tutorial/notebooks/01_thicket_tutorial.ipynb +COPY ./tutorial-code/thicket-tutorial/notebooks/02_thicket_rajaperf_clustering.ipynb ${HOME}/thicket-tutorial/notebooks/02_thicket_rajaperf_clustering.ipynb +COPY ./tutorial-code/thicket-tutorial/LICENSE ${HOME}/thicket-tutorial + +COPY tutorial-code/system-description/aws-tutorial ${HOME}/benchpark/systems/aws-tutorial +COPY tutorial-code/system-description/AWS_Tutorial-c7i-EFA ${HOME}/benchpark/systems/all_hardware_descriptions/AWS_Tutorial-c7i-EFA + +RUN chown -R jovyan ${HOME} +# RUN chmod -R 777 ~/ ${HOME} + +WORKDIR ${HOME} + +COPY ./docker/spawn-entrypoint.sh /entrypoint.sh +COPY ./docker/spawn-local-entrypoint.sh /local-entrypoint.sh +RUN chmod 777 /entrypoint.sh +RUN chmod 777 /local-entrypoint.sh + +USER ${NB_USER} +ENV SHELL=/usr/bin/bash +ENV FLUX_URI_RESOLVE_LOCAL=t + +EXPOSE 8888 +ENTRYPOINT [ "tini", "--" ] + +CMD ["flux", "start", "jupyter", "lab"] diff --git a/2025-eScience/docker/Dockerfile.thicket b/2025-eScience/docker/Dockerfile.thicket new file mode 100644 index 0000000..2d92eb3 --- /dev/null +++ b/2025-eScience/docker/Dockerfile.thicket @@ -0,0 +1,18 @@ +# Copyright 2025 Lawrence Livermore National Security, LLC and other +# Benchpark developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +# For testing +# FROM test-caliper + +FROM ghcr.io/llnl/caliper:escience-2025 + +USER root + +RUN . 
/opt/global_py_venv/bin/activate && \ + python3 -m pip install llnl-hatchet==2024.1.3 && \ + python3 -m pip install "llnl-thicket[plotting] @ git+https://github.com/LLNL/thicket.git@develop-2024-11-02" +# python3 -m pip install llnl-thicket[extrap,plotting]==2024.2.1 + +USER ${NB_USER} diff --git a/2025-eScience/docker/build_and_upload_images.sh b/2025-eScience/docker/build_and_upload_images.sh new file mode 100755 index 0000000..5ef8282 --- /dev/null +++ b/2025-eScience/docker/build_and_upload_images.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +set -e + +function usage { + echo "Usage: ./build_and_upload_images.sh [image_to_build]" +} + +if [ $# -lt 1 ]; then + usage + exit 1 +fi + +if [ $1 == "-h" ] || [ $1 == "--help" ]; then + usage + exit 0 +fi + +TAG="$1" + +DOCKER_PLATFORMS="linux/amd64,linux/arm64" + +TO_BUILD_IDS=( "caliper" "thicket" "benchpark" "init" "spawn" ) + +if [ $# -ge 2 ]; then + TO_BUILD_IDS=( "$2" ) +fi + +caliper_IMAGE="ghcr.io/ilumsden/escience-caliper" +thicket_IMAGE="ghcr.io/ilumsden/escience-thicket" +benchpark_IMAGE="ghcr.io/ilumsden/escience-benchpark" +init_IMAGE="ghcr.io/ilumsden/escience-test-init" +spawn_IMAGE="ghcr.io/ilumsden/escience-test-spawn" + +if ! command -v gh >/dev/null 2>&1; then + echo "This script requires the GitHub CLI (i.e., the gh command)." + echo "Install the CLI and rerun this script." + exit 1 +fi + +if ! command -v docker >/dev/null 2>&1; then + echo "This script requires Docker." + echo "Install Docker and rerun this script." + exit 1 +fi + +echo $(gh auth token) | docker login ghcr.io -u $(gh api user --jq .login) --password-stdin + +for bid in ${TO_BUILD_IDS[@]}; do + CURR_IMAGE_NAME="${bid}_IMAGE" + docker build --platform $DOCKER_PLATFORMS -f Dockerfile.$bid -t ${!CURR_IMAGE_NAME}:$TAG . + docker push ${!CURR_IMAGE_NAME}:$TAG +done + +# docker build --platform $DOCKER_PLATFORMS -f Dockerfile.benchpark -t escience-benchpark:latest . +# docker tag escience-benchpark:latest $BENCHPARK_IMAGE:$TAG +# docker push $BENCHPARK_IMAGE:$TAG +# +# docker build --platform $DOCKER_PLATFORMS -f Dockerfile.init -t escience-init:latest . +# docker tag escience-init:latest $INIT_IMAGE:$TAG +# docker push $INIT_IMAGE:$TAG +# +# docker build --platform $DOCKER_PLATFORMS -f Dockerfile.spawn -t escience-spawn:latest . +# docker tag escience-spawn:latest $SPAWN_IMAGE:$TAG +# docker push $SPAWN_IMAGE:$TAG diff --git a/2025-eScience/docker/get_requirements_for_jupyterhub_k8s.sh b/2025-eScience/docker/get_requirements_for_jupyterhub_k8s.sh new file mode 100755 index 0000000..3790c05 --- /dev/null +++ b/2025-eScience/docker/get_requirements_for_jupyterhub_k8s.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +function usage { + echo "Usage: ./get_requirements_for_jupyterhub_k8s.sh " +} + +if [[ $# -ne 1 ]]; then + usage + exit 1 +fi + +jh_k8s_version="$1" + +if [[ "$jh_k8s_version" == "--help" ]] || [[ "$jh_k8s_version" == "-h" ]]; then + usage + exit 0 +fi + +if ! command -v gh >/dev/null; then + echo "This script requires the GitHub CLI (i.e., the 'gh' command)!" + echo "Install and try again" + exit 1 +fi + +if ! command -v curl >/dev/null; then + echo "This script requires the 'curl' command!" + echo "Install and try again" + exit 1 +fi + +if ! command -v jq >/dev/null; then + echo "This script requires the 'jq' command!" + echo "Install and try again" + exit 1 +fi + +if ! command -v jq >/dev/null; then + echo "This script requires the 'wget' command!" + echo "Install and try again" + exit 1 +fi + +if ! 
command -v sed >/dev/null; then + echo "This script requires the 'sed' command!" + echo "Install and try again" + exit 1 +fi + +github_url="https://api.github.com/repos/jupyterhub/zero-to-jupyterhub-k8s/contents/images/hub/requirements.txt?ref=$jh_k8s_version" + +file_lookup_json=$(curl -JL \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $(gh auth token)" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "$github_url") + +wget_url=$(echo "$file_lookup_json" | jq -r '.download_url') + +wget -O $(pwd)/requirements.txt $wget_url + +sed -i '' '/psycopg2/d' $(pwd)/requirements.txt diff --git a/2025-eScience/docker/init-entrypoint.sh b/2025-eScience/docker/init-entrypoint.sh new file mode 100755 index 0000000..c99f82f --- /dev/null +++ b/2025-eScience/docker/init-entrypoint.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env sh +# Copyright 2025 Lawrence Livermore National Security, LLC and other +# Benchpark developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +# NOTE: this script runs as root + +chown -R 1000 /home/jovyan diff --git a/2025-eScience/docker/kitware-archive.sh b/2025-eScience/docker/kitware-archive.sh new file mode 100755 index 0000000..4b37074 --- /dev/null +++ b/2025-eScience/docker/kitware-archive.sh @@ -0,0 +1,99 @@ +#!/bin/sh + +set -eu + +help() { + echo "Usage: $0 [--release ] [--rc]" > /dev/stderr +} + +doing= +rc= +release= +help= +for opt in "$@" +do + case "${doing}" in + release) + release="${opt}" + doing= + ;; + "") + case "${opt}" in + --rc) + rc=1 + ;; + --release) + doing=release + ;; + --help) + help=1 + ;; + esac + ;; + esac +done + +if [ -n "${doing}" ] +then + echo "--${doing} option given no argument." > /dev/stderr + echo > /dev/stderr + help + exit 1 +fi + +if [ -n "${help}" ] +then + help + exit +fi + +if [ -z "${release}" ] +then + unset UBUNTU_CODENAME + . /etc/os-release + + if [ -z "${UBUNTU_CODENAME+x}" ] + then + echo "This is not an Ubuntu system. Aborting." > /dev/stderr + exit 1 + fi + + release="${UBUNTU_CODENAME}" +fi + +case "${release}" in +noble|jammy|focal) + packages= + keyring_packages="ca-certificates gpg wget" + ;; +*) + echo "Only Ubuntu Noble (24.04), Jammy (22.04), and Focal (20.04) are supported. Aborting." > /dev/stderr + exit 1 + ;; +esac + +get_keyring= +if [ ! 
-f /usr/share/doc/kitware-archive-keyring/copyright ] +then + packages="${packages} ${keyring_packages}" + get_keyring=1 +fi + +# Start the real work +set -x + +apt-get update +# shellcheck disable=SC2086 +apt-get install -y ${packages} + +test -n "${get_keyring}" && (wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - > /usr/share/keyrings/kitware-archive-keyring.gpg) + +echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ ${release} main" > /etc/apt/sources.list.d/kitware.list +if [ -n "${rc}" ] +then + echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ ${release}-rc main" >> /etc/apt/sources.list.d/kitware.list +fi + +apt-get update +test -n "${get_keyring}" && rm /usr/share/keyrings/kitware-archive-keyring.gpg +apt-get install -y kitware-archive-keyring \ No newline at end of file diff --git a/2025-eScience/docker/requirements.txt b/2025-eScience/docker/requirements.txt new file mode 100644 index 0000000..6cafa51 --- /dev/null +++ b/2025-eScience/docker/requirements.txt @@ -0,0 +1,269 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# Use the "Run workflow" button at https://github.com/jupyterhub/zero-to-jupyterhub-k8s/actions/workflows/watch-dependencies.yaml +# +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.11.16 + # via kubernetes-asyncio +aiosignal==1.3.2 + # via aiohttp +alembic==1.15.2 + # via jupyterhub +annotated-types==0.7.0 + # via pydantic +arrow==1.3.0 + # via isoduration +attrs==25.3.0 + # via + # aiohttp + # jsonschema + # referencing +bcrypt==4.3.0 + # via + # jupyterhub-firstuseauthenticator + # jupyterhub-nativeauthenticator +cachetools==5.5.2 + # via google-auth +certifi==2025.1.31 + # via + # kubernetes-asyncio + # requests +certipy==0.2.2 + # via jupyterhub +cffi==1.17.1 + # via cryptography +charset-normalizer==3.4.1 + # via requests +cryptography==44.0.2 + # via + # certipy + # pyjwt +escapism==1.0.1 + # via + # jupyterhub-kubespawner + # jupyterhub-ltiauthenticator +fqdn==1.5.1 + # via jsonschema +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +google-auth==2.39.0 + # via google-auth-oauthlib +google-auth-oauthlib==1.2.1 + # via oauthenticator +greenlet==3.1.1 + # via sqlalchemy +idna==3.10 + # via + # jsonschema + # jupyterhub + # requests + # yarl +isoduration==20.11.0 + # via jsonschema +jinja2==3.1.6 + # via + # jupyterhub + # jupyterhub-kubespawner +jsonpointer==3.0.0 + # via jsonschema +jsonschema==4.23.0 + # via + # jupyter-events + # oauthenticator +jsonschema-specifications==2024.10.1 + # via jsonschema +jupyter-events==0.12.0 + # via jupyterhub +jupyterhub==5.3.0 + # via + # -r unfrozen/requirements.txt + # jupyterhub-firstuseauthenticator + # jupyterhub-kubespawner + # jupyterhub-ldapauthenticator + # jupyterhub-ltiauthenticator + # jupyterhub-nativeauthenticator + # oauthenticator +jupyterhub-firstuseauthenticator==1.1.1 + # via -r unfrozen/requirements.txt +jupyterhub-hmacauthenticator==1.0 + # via -r unfrozen/requirements.txt +jupyterhub-idle-culler==1.4.0 + # via -r unfrozen/requirements.txt +jupyterhub-kubespawner==7.0.0 + # via -r unfrozen/requirements.txt +jupyterhub-ldapauthenticator==2.0.2 + # via -r unfrozen/requirements.txt +jupyterhub-ltiauthenticator==1.6.2 + # via -r unfrozen/requirements.txt +jupyterhub-nativeauthenticator==1.3.0 + # via -r unfrozen/requirements.txt +jupyterhub-tmpauthenticator==1.0.0 + # via -r 
unfrozen/requirements.txt +kubernetes-asyncio==32.3.0 + # via jupyterhub-kubespawner +ldap3==2.9.1 + # via jupyterhub-ldapauthenticator +mako==1.3.10 + # via alembic +markupsafe==3.0.2 + # via + # jinja2 + # mako +multidict==6.4.3 + # via + # aiohttp + # yarl +mwoauth==0.4.0 + # via oauthenticator +oauthenticator==17.3.0 + # via -r unfrozen/requirements.txt +oauthlib==3.2.2 + # via + # jupyterhub + # jupyterhub-ltiauthenticator + # mwoauth + # requests-oauthlib +onetimepass==1.0.1 + # via jupyterhub-nativeauthenticator +packaging==24.2 + # via + # jupyter-events + # jupyterhub + # jupyterhub-idle-culler +pamela==1.2.0 + # via jupyterhub +prometheus-client==0.21.1 + # via jupyterhub +propcache==0.3.1 + # via + # aiohttp + # yarl + # via -r unfrozen/requirements.txt +pyasn1==0.6.1 + # via + # ldap3 + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth +pycparser==2.22 + # via cffi +pycurl==7.45.6 + # via -r unfrozen/requirements.txt +pydantic==2.11.3 + # via jupyterhub +pydantic-core==2.33.1 + # via pydantic +pyjwt==2.10.1 + # via + # jupyterhub-ltiauthenticator + # mwoauth + # oauthenticator +pymysql==1.1.1 + # via -r unfrozen/requirements.txt +python-dateutil==2.9.0.post0 + # via + # arrow + # jupyterhub + # jupyterhub-idle-culler + # kubernetes-asyncio +python-json-logger==3.3.0 + # via jupyter-events +python-slugify==8.0.4 + # via jupyterhub-kubespawner +pyyaml==6.0.2 + # via + # jupyter-events + # jupyterhub-kubespawner + # kubernetes-asyncio +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +requests==2.32.3 + # via + # jupyterhub + # mwoauth + # oauthenticator + # requests-oauthlib +requests-oauthlib==2.0.0 + # via + # google-auth-oauthlib + # mwoauth +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rpds-py==0.24.0 + # via + # jsonschema + # referencing +rsa==4.9 + # via google-auth +ruamel-yaml==0.18.10 + # via oauthenticator +ruamel-yaml-clib==0.2.12 + # via ruamel-yaml +six==1.17.0 + # via + # kubernetes-asyncio + # onetimepass + # python-dateutil + # rfc3339-validator +sqlalchemy==2.0.40 + # via + # alembic + # jupyterhub + # sqlalchemy-cockroachdb +sqlalchemy-cockroachdb==2.0.2 + # via -r unfrozen/requirements.txt +statsd==4.0.1 + # via -r unfrozen/requirements.txt +text-unidecode==1.3 + # via python-slugify +tornado==6.4.2 + # via + # jupyterhub + # jupyterhub-idle-culler + # oauthenticator +traitlets==5.14.3 + # via + # jupyter-events + # jupyterhub + # jupyterhub-kubespawner + # jupyterhub-ldapauthenticator + # jupyterhub-ltiauthenticator + # oauthenticator +types-python-dateutil==2.9.0.20241206 + # via arrow +typing-extensions==4.13.2 + # via + # alembic + # pydantic + # pydantic-core + # referencing + # sqlalchemy + # typing-inspection +typing-inspection==0.4.0 + # via pydantic +uri-template==1.3.0 + # via jsonschema +urllib3==2.4.0 + # via + # jupyterhub-kubespawner + # kubernetes-asyncio + # requests +webcolors==24.11.1 + # via jsonschema +yarl==1.19.0 + # via aiohttp diff --git a/2025-eScience/docker/spawn-entrypoint.sh b/2025-eScience/docker/spawn-entrypoint.sh new file mode 100755 index 0000000..de4c271 --- /dev/null +++ b/2025-eScience/docker/spawn-entrypoint.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2025 Lawrence Livermore National Security, LLC and other +# Benchpark developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# TODO uncomment if we want multiple "nodes" +# num_cores_per_node=2 +# total_num_cores=$(nproc --all) +# num_brokers=$(( $total_num_cores / $num_cores_per_node )) +# /usr/bin/mpiexec.hydra -n $num_brokers -bind-to core:$num_cores_per_node /usr/bin/flux start /opt/global_py_venv/bin/jupyterhub-singleuser + +# NOTE: use this if we only want a single "node" +if [[ $# -ne 1 ]]; then + /usr/bin/flux start /opt/global_py_venv/bin/jupyterhub-singleuser +else + last_core_id=$(( $1 - 1 )) + mkdir -p ${HOME}/.flux + cat > ${HOME}/.flux/resource.toml < ${HOME}/.flux/resource.toml </dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +hub_pod_id=$(kubectl get pods -n default --no-headers=true | awk '/hub/{print $1}') +kubectl logs $hub_pod_id \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/check_init_container_log.sh b/2025-eScience/infrastructure/dry-run/check_init_container_log.sh new file mode 100755 index 0000000..f4fd398 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/check_init_container_log.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if [ $# -ne 1 ]; then + echo "Usage: ./check_init_container_log.sh " + exit 1 +fi + +kubectl logs $1 -c init-tutorial-service \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/check_jupyterhub_status.sh b/2025-eScience/infrastructure/dry-run/check_jupyterhub_status.sh new file mode 100755 index 0000000..10b4261 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/check_jupyterhub_status.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl --namespace=default get pods + +echo "If there are issues with any pods, you can get more details with:" +echo " $ kubectl --namespace=default describe pod " \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/cleanup.sh b/2025-eScience/infrastructure/dry-run/cleanup.sh new file mode 100755 index 0000000..0174bbb --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/cleanup.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" 
+ echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +# Temporarily allow errors in the script so that the script won't fail +# if the JupyterHub deployment failed or was previously torn down +set +e +echo "Tearing down JupyterHub and uninstalling everything related to Helm:" +helm uninstall escience-2025-dry-run-jupyter +set -e + +echo "" +echo "Deleting all pods from the EKS cluster:" +kubectl delete pod --all-namespaces --all --force + +echo "" +echo "Deleting the EKS cluster:" +eksctl delete cluster --config-file ./eksctl-config.yaml --wait + +echo "" +echo "Everything is now cleaned up!" diff --git a/2025-eScience/infrastructure/dry-run/cluster-autoscaler.yaml b/2025-eScience/infrastructure/dry-run/cluster-autoscaler.yaml new file mode 100644 index 0000000..3c884ae --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/cluster-autoscaler.yaml @@ -0,0 +1,272 @@ +# The roles defined in this config file set permissions on several Kubernetes resources. +# +# Resources referred to: +# * events: resource representing information/responses generated from actions or changes taken against the cluster +# * endpoints: resource representing REST API endpoints within the cluster +# * pods/eviction: resource that terminates and removes pods when created +# * pods/status: resource used to query or edit the status of pods +# * nodes: resource representing the physical or virtual nodes of the cluster +# * namespaces: resource representing a group of isolated resources within the cluster +# * pods: resource representing a unit of computation that is deployed to a node +# * services: resource representing a networked application running in a pod and exposed over the network (either internal to the cluster or external to the broader internet) +# * replicationcontrollers: legacy resource for managing horizontal scaling (i.e., scale-out). Used for broader support across clouds +# * persistantvolumeclaims: resource representing a request for storage by a user +# * persistantvolumes: resource representing actual storage +# * replicasets: resource that creates replica pods that are used to ensure some minimum number of identical pods in the cluster +# * daemonsets: resource that ensures copies of pods are deployed to new nodes and removed from removed nodes +# * poddisruptionbudgets: resource that represents the cluster policy regarding the minimum number of pods that must remain available +# during voluntary disruptions (i.e., pod/node eviction not caused by something like hardware failure) +# * statefulsets: resource that maintains pod state +# * storageclasses: resource that describes different types of storage. Often used for things like QoS levels +# * csinodes: resource that describes a node's ability to interact with one or more storage providers. Mainly used by Kubernetes's scheduler +# * csidrivers: resource that provide information on the drivers for a single storage provider installed on a node +# * csistoragecapacities: resource that describes the available storage from different providers +# * jobs: resource that represents one-off tasks spread across one or more pods that must run to completion. Useful for certain types of setup and elasticity work +# * leases: resource that allows different pods, nodes, or kublets (kubernetes daemon on a node) to lock shared resources. Think of it like a mutex +# * configmaps: resource representing non-confidential key-value pair info. 
Often used to decouple environment-specific configuration from container images +--- +# Create a Service Account that will act as the internal user during the creation +# of the autoscaling infrastructure and have all the appropriate roles and permissions assigned +# to do its work +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler + name: cluster-autoscaler + namespace: kube-system +--- +# Create a ClusterRole to set permissions for associated +# users across the entire cluster +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create or partially update events and endpoints + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + # Allow associated users to evict pods + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + # Allow associated users to update pod statuses + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + # Allow associated users to get and update the state of the autoscaler + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + # Allow associated users to be notified of changes to, list, get the state of, + # and fully update information related to nodes + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + # Allow associated users to be notified of changes to, list, and get the state of + # namespaces, pods, services, replication controllers, persistent volume claims, and + # persistent volumes + - apiGroups: [""] + resources: + - "namespaces" + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to, list, and get the state of + # replica sets, and daemon sets + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to and list pod disruption budgets + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + # Allow associated users to be notified of changes to, list, and get the state of + # stateful sets, replica sets, and daemon sets + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of chagnes to, list, and get the state of + # all resources related to available storage + - apiGroups: ["storage.k8s.io"] + resources: + ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"] + verbs: ["watch", "list", "get"] + # Allow associated users to get the state of, list, be notified of chagnes to, and partially update + # jobs launched in the cluster + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + # Allow associated users to create leases + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + # Allow associated users to get the state of and fully update leases in the autoscaler + - apiGroups: ["coordination.k8s.io"] + resourceNames: ["cluster-autoscaler"] + resources: ["leases"] + verbs: ["get", "update"] +--- +# Create a Role to set permissions within the 'kube-system' namespace 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: cluster-autoscaler + # The permissions in this Role apply to the 'kube-system' namespace + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create, list, and be notified of changes to config maps + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "list", "watch"] + # Allow associated users to delete, get the state of, fully update, and be notified of + # changes to config maps in the autoscaler's status and priority-expander subresources + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - "cluster-autoscaler-status" + - "cluster-autoscaler-priority-expander" + verbs: ["delete", "get", "update", "watch"] +--- +# Grant permissions defined by the ClusterRole +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the ClusterRole named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" workspace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Grant permissions defined by the Role +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the Role named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" workspace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Define deployment rules for pods and ReplicaSets +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + app: cluster-autoscaler +spec: + replicas: 1 # Number of pods to run + # Apply to pods where the app has a label called 'app' + # with value 'cluster-autoscaler' + selector: + matchLabels: + app: cluster-autoscaler + # Definition of created pods + template: + metadata: + labels: + app: cluster-autoscaler + # Allow Prometheus to collect monitoring data over port 8085 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8085" + spec: + priorityClassName: system-cluster-critical + securityContext: + # The Kubelet must be run as a non-root user + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + # Use the default seccomp profile as specified by the + # container runtime + seccompProfile: + type: RuntimeDefault + serviceAccountName: cluster-autoscaler + # The container(s) to run within the pod. 
+ # Since we're running an autoscaler, we'll run the autoscaler + # as the pod's only container, and then we'll deploy other + # containers within the autoscaler to actually do work + containers: + # The main container for the pod will be the + # Kubernetes autoscaling container + - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.2 + name: cluster-autoscaler + resources: + # Maximum amount of compute resources allowed + limits: + cpu: 100m + memory: 600Mi + # Minimum amount of compute resources required + # Defaults to 'limits' if not specified + requests: + cpu: 100m + memory: 600Mi + command: + - ./cluster-autoscaler + - --v=4 + - --stderrthreshold=info + - --cloud-provider=aws + - --skip-nodes-with-local-storage=false + - --expander=least-waste + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/escience-2025-dry-run + volumeMounts: + # Mount the CA SSL/TLS certificates into the container + - name: ssl-certs + mountPath: /etc/ssl/certs/ca-certificates.crt + readOnly: true + # Always pull the digest of the image from the + # container registry. If the locally cached digest is + # the same as the pulled digest, use the cached container image. + # Otherwise, pull the container from the registry + imagePullPolicy: "Always" + securityContext: + # Don't let the pod have more privileges than the + # parent process + allowPrivilegeEscalation: false + capabilities: + # Remove all capabilities + drop: + - ALL + # Root filesystem (i.e., '/') is read-only + readOnlyRootFilesystem: true + volumes: + - name: ssl-certs + hostPath: + path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/2025-eScience/infrastructure/dry-run/config.toml b/2025-eScience/infrastructure/dry-run/config.toml new file mode 100644 index 0000000..1157a2d --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/config.toml @@ -0,0 +1,54 @@ +tutorial_name = "escience-2025-dry-run" + +[aws.eksctl] +cluster_name = "escience-2025-dry-run" +cluster_deployment_region = "us-west-1" +cluster_availability_zones = [ + "us-west-1a", + "us-west-1c", +] + +[[aws.eksctl.cluster_node_groups]] +zone = "us-west-1a" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 2 +min_size = 2 +max_size = 8 + +[[aws.eksctl.cluster_node_groups]] +zone = "us-west-1c" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 2 +min_size = 2 +max_size = 8 + +[aws."Kubernetes autoscaler"] +cpu_max = "100m" +memory_max = "600Mi" +cpu_min = "100m" +memory_min = "600Mi" + +[aws.Helm] +max_concurrent_users = 14 +hub_password = "hpctutorial25" +hub_db_capacity = "32Gi" +ebs_storage_type = "gp3" +hub_container_image = "jupyterhub/k8s-hub" +hub_container_tag = "4.2.0" +spawner_container_image = "ghcr.io/llnl/reproducible-benchmarking-spawn" +spawner_container_tag = "hpcic-2025" +spawner_image_entrypoint = "/entrypoint.sh 32" +cpu_min = "32" +cpu_max = "32" +mem_min = "64G" +mem_max = "64G" +provide_extra_shmem = true +init_container_image = "ghcr.io/llnl/reproducible-benchmarking-init" +init_container_tag = "hpcic-2025" +init_image_entrypoint = "/entrypoint.sh" + +[aws."utility scripts"] +jupyterhub_helm_version = "4.2.0" +ebs_csidriver_version = "v1.45.0" diff --git a/2025-eScience/infrastructure/dry-run/configure_kubernetes.sh b/2025-eScience/infrastructure/dry-run/configure_kubernetes.sh new file mode 100755 index 0000000..5c4bee6 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/configure_kubernetes.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +if ! 
command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +echo "Configuring the Cluster Autoscaler:" +kubectl apply -k "github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=v1.45.0" +kubectl apply -f ./cluster-autoscaler.yaml +echo "" +echo "Configuring the Storage Class:" +kubectl apply -f ./storage-class.yaml + +echo "" +echo "Patching the cluster to make the configured storage class the default:" +kubectl patch storageclass gp3 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' + +echo "" +echo "Done configuring Kubernetes!" +echo "" +echo "Next, you should run deploy_jupyterhub.sh to actually deploy JupyterHub and the tutorial." \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/create_cluster.sh b/2025-eScience/infrastructure/dry-run/create_cluster.sh new file mode 100755 index 0000000..f631168 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/create_cluster.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +echo "Creating EKS cluster with eksctl:" +eksctl create cluster --config-file ./eksctl-config.yaml + +echo "Done creating the EKS cluster!" +echo "" +echo "Next, you should run configure_kubernetes.sh to configure Kubernetes on the cluster." \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/deploy_jupyterhub.sh b/2025-eScience/infrastructure/dry-run/deploy_jupyterhub.sh new file mode 100755 index 0000000..b7c91b2 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/deploy_jupyterhub.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +echo "Adding JupyterHub to EKS cluster using Helm:" +helm repo add jupyterhub https://hub.jupyter.org/helm-chart/ +helm repo update +echo "" +echo "Installing the Helm chart and deploying JupyterHub to EKS:" +helm install escience-2025-dry-run-jupyter jupyterhub/jupyterhub --version 4.2.0 --values ./helm-config.yaml + +echo "" +echo "Done deploying JupyterHub!" +echo "" +echo "Next, you should ensure all the pods spawned correctly with check_jupyterhub_status.sh," +echo "and you should get the cluster URL with get_jupyterhub_url.sh." +echo "" +echo "If something went wrong, you can edit the helm-config.yaml file to try to fix the issue." +echo "After editing helm-config.yaml, you can normally reconfigure and relaunch JupyterHub using" +echo "the update_jupyterhub_deployment.sh script. If that doesn't work or if you need to edit" +echo "storage-class.yaml or cluster-autoscaler.yaml, you should first tear down JupyterHub with" +echo "tear_down_jupyterhub.sh, and then you should bring Jupyter back up by rerunning deploy_jupyterhub.sh." 
+echo "" +echo "If everything went smoothly, the cluster URL is what you should share with attendees." +echo "" +echo "Attendees can get a Jupyter environment to work in by going to that URL and logging in" +echo "with a username of their choice and the password specified in helm-config.yaml." +echo "" +echo "Note: users should have unique usernames. If two users have the same username, they will" +echo " share the same pod." +echo "" +echo "After you are done with your tutorial, you should finally run cleanup.sh to bring down" +echo "the EKS cluster and all associated resources." diff --git a/2025-eScience/infrastructure/dry-run/eksctl-config.yaml b/2025-eScience/infrastructure/dry-run/eksctl-config.yaml new file mode 100644 index 0000000..7301cf7 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/eksctl-config.yaml @@ -0,0 +1,110 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +# Define the name of the cluster and the deployment region +metadata: + name: escience-2025-dry-run + region: us-west-1 + +# Create the IAM policies needed to enable the autoscaler and storage +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: cluster-autoscaler + + # https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "autoscaling:DescribeAutoScalingGroups" + - "autoscaling:DescribeAutoScalingInstances" + - "autoscaling:DescribeLaunchConfigurations" + - "autoscaling:DescribeTags" + - "autoscaling:SetDesiredCapacity" + - "autoscaling:TerminateInstanceInAutoScalingGroup" + - "ec2:DescribeLaunchTemplateVersions" + Resource: "*" + + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: aws-ebs-csi-driver + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ec2:AttachVolume" + - "ec2:CreateSnapshot" + - "ec2:CreateTags" + - "ec2:CreateVolume" + - "ec2:DeleteSnapshot" + - "ec2:DeleteTags" + - "ec2:DeleteVolume" + - "ec2:DescribeInstances" + - "ec2:DescribeSnapshots" + - "ec2:DescribeTags" + - "ec2:DescribeVolumes" + - "ec2:DetachVolume" + Resource: "*" + +# Specify the availability zone from which nodes will be obtained +availabilityZones: +- "us-west-1a" +- "us-west-1c" + + +# Define rules for nodegroups for each availability zone +managedNodeGroups: + + - name: node-group-us-west-1a + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of storage volume for the availability zone, in gigabytes + volumeSize: 30 + # Number of nodes to start with in this availability zone + desiredCapacity: 2 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 2 + # Maximum number of nodes that will every be allocated in this availability zone + maxSize: 8 + privateNetworking: true + availabilityZones: + - us-west-1a + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" + + - name: node-group-us-west-1c + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of storage volume for the availability zone, in gigabytes + volumeSize: 30 + # Number of nodes to start with in this 
availability zone + desiredCapacity: 2 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 2 + # Maximum number of nodes that will every be allocated in this availability zone + maxSize: 8 + privateNetworking: true + availabilityZones: + - us-west-1c + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" diff --git a/2025-eScience/infrastructure/dry-run/get_jupyterhub_url.sh b/2025-eScience/infrastructure/dry-run/get_jupyterhub_url.sh new file mode 100755 index 0000000..ddfd250 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/get_jupyterhub_url.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl get -o json service proxy-public | jq '.status.loadBalancer.ingress[0].hostname' \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/helm-config.yaml b/2025-eScience/infrastructure/dry-run/helm-config.yaml new file mode 100644 index 0000000..2a20e7b --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/helm-config.yaml @@ -0,0 +1,121 @@ +# Uncomment if you need to debug your deployment of Jupyter. +# For more information on debugging, see: +# https://z2jh.jupyter.org/en/stable/administrator/debug.html +# debug: +# enabled: true + +hub: + # Maximum number of users with spawned JupyterLab environments (i.e., pods) at a time + concurrentSpawnLimit: 14 + config: + # Define a password for login + DummyAuthenticator: + password: hpctutorial25 + JupyterHub: + admin_access: true + authenticator_class: dummy + + # Define storage quantity for JupyterHub's persistent database + # We could explicitly set storage class name here, + # but we won't because we've marked the storage class defined + # in storage-class.yaml as default + db: + pvc: + storage: 32Gi + storageClassName: gp3 + + # Specify the hub image for the tutorial. + # The hub image should be based off of the jupyterhub/k8s-hub image. + # Its job is twofold: + # 1) If desired, replace the login page (at /usr/local/share/jupyterhub/templates/login.html) with a custom HTML login page + # 2) Set the user + image: + name: jupyterhub/k8s-hub + tag: "4.2.0" + pullPolicy: Always + + # Define resource usage for JupyterHub + # For large tutorials, it is recommended to set these higher + + # We are just using defualt resource usage + + + # Define custom hostname for JupyterHub + + # We are not using a custom hostname + + +# Based on optimization recommendations from: +# https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +# scheduling: +# podPriority: +# enabled: true +# userPlaceholder: +# replicas: 3 + +# Define the spawner and init containers for each attendee's pod +singleuser: + # Specify the spawner image for the tutorial. 
+ # The spawner image should do the following: + # 1) Install any necessary software + # 2) Define the user for the tutorial (we usually default to jovyan) + # 3) If custom Python packages are needed, it's often recommended to install a custom Jupyter kernel with `IPython kernel install` + # 4) If you want a custom Jupyter launcher UI, install the appropriate packages and update JUPYTER_APP_LAUNCHER_PATH + # 5) Copy any necessary local scripts or files and ensure proper permissions + image: + name: ghcr.io/llnl/reproducible-benchmarking-spawn + tag: "escience-2025" + pullPolicy: Always + # Specify the minimum (i.e., guarantee) and maximum (i.e., limit) amount of resources per user + cpu: + limit: 32 + guarantee: 32 + memory: + limit: "64G" + guarantee: "64G" + # If needed, specify a custom entrypoint into the spawner image. + # For more information, look at the documentation for Docker ENTRYPOINT and CMD directives: + # https://www.docker.com/blog/docker-best-practices-choosing-between-run-cmd-and-entrypoint/ + cmd: ["/entrypoint.sh", "32"] + # Specify the init image for the tutorial. + # This image is optional, but it can be used to do last second configuration or installation of files + # before the user gains control of the pod. + # + # A good usecase for the init image is to set permissions and ensure the tutorial user will be able to + # access the files for your tutorial. An example Dockerfile for the init image may look like: + # + # Dockerfile: + # FROM alpine/git + # ENV NB_USER=jovyan \ + # NB_UID=1000 \ + # HOME=/home/jovyan + # + # RUN adduser \ + # -D \ + # -g "Default user" \ + # -u ${NB_UID} \ + # -h ${HOME} \ + # ${NB_USER} + # + # COPY ./init-entrypoint.sh /entrypoint.sh + # + # The 'command' field for the init container specifies the entrypoint for the container. For the Dockerfile + # above, the entrypoint should be "/entrypoint.sh". This script could look something like this: + # + # entrypoint.sh (would be ./init-entrypoint.sh on your local computer) + # chown -R 1000 /home/jovyan + initContainers: + - name: init-tutorial-service + image: ghcr.io/llnl/reproducible-benchmarking-init:escience-2025 + command: ["/entrypoint.sh"] + imagePullPolicy: Always + storage: + type: none + extraVolumes: + - name: shm-volume + emptyDir: + medium: Memory + extraVolumeMounts: + - name: shm-volume + mountPath: /dev/shm + diff --git a/2025-eScience/infrastructure/dry-run/storage-class.yaml b/2025-eScience/infrastructure/dry-run/storage-class.yaml new file mode 100644 index 0000000..b83a030 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/storage-class.yaml @@ -0,0 +1,7 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: gp3 +provisioner: kubernetes.io/aws-ebs +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete \ No newline at end of file diff --git a/2025-eScience/infrastructure/dry-run/tear_down_jupyterhub.sh b/2025-eScience/infrastructure/dry-run/tear_down_jupyterhub.sh new file mode 100755 index 0000000..d0bb101 --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/tear_down_jupyterhub.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm uninstall escience-2025-dry-run-jupyter + +echo "Helm's JupyterHub deployment is torn down." 
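+# Optional follow-up checks (not run by this script), assuming helm and kubectl point at this cluster:
+#   helm list --namespace default           # confirm the JupyterHub release is gone
+#   kubectl --namespace=default get pods    # see which pods, if any, are still running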
+echo "If any attendee pods are remaining, you can delete them with 'kubectl delete pod '" +echo "" +echo "To recreate the JupyterHub deployment, just run deploy_jupyterhub.sh again." diff --git a/2025-eScience/infrastructure/dry-run/update_jupyterhub_deployment.sh b/2025-eScience/infrastructure/dry-run/update_jupyterhub_deployment.sh new file mode 100755 index 0000000..dfdca5d --- /dev/null +++ b/2025-eScience/infrastructure/dry-run/update_jupyterhub_deployment.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm upgrade escience-2025-dry-run-jupyter jupyterhub/jupyterhub --values ./helm-config.yaml + +echo "The JupyterHub deployment is updated!" diff --git a/2025-eScience/infrastructure/production/README.md b/2025-eScience/infrastructure/production/README.md new file mode 100644 index 0000000..ab35b5e --- /dev/null +++ b/2025-eScience/infrastructure/production/README.md @@ -0,0 +1,114 @@ +# Deploy escience-2025-tutorial to AWS Elastic Kubernetes Service (EKS) + +These config files and scripts can be used to deploy the escience-2025-tutorial tutorial to EKS. + +The sections below walk you through the steps to deploying your cluster. All commands in these +sections should be run from the same directory as this README. + +## Step 1: Create EKS cluster + +To create an EKS cluster with your configured settings, run the following: + +```bash +$ ./create_cluster.sh +``` + +Be aware that this step can take upwards of 15-30 minutes to complete. + +## Step 2: Configure Kubernetes within the EKS cluster + +After creating the cluster, we need to configure Kubernetes and its addons. In particular, +we need to setup the Kubernetes autoscaler, which will allow our tutorial to scale to as +many users as our cluster's resources can possibly handle. + +To configure Kubernetes and the autoscaler, run the following: + +```bash +$ ./configure_kubernetes.sh +``` + +## Step 3: Deploy JupyterHub to the EKS cluster + +With the cluster properly created and configured, we now can deploy JupyterHub to the cluster +to manage everything else about our tutorial. + +To deploy JupyterHub, run the following: + +```bash +$ ./deploy_jupyterhub.sh +``` + +## Step 4: Verify that everything is working + +After deploying JupyterHub, we need to make sure that all the necessary components +are working properly. + +To check this, run the following: + +```bash +$ ./check_jupyterhub_status.sh +``` + +If everything worked properly, you should see an output like this: + +``` +NAME READY STATUS RESTARTS AGE +continuous-image-puller-2gqrw 1/1 Running 0 30s +continuous-image-puller-gb7mj 1/1 Running 0 30s +hub-8446c9d589-vgjlw 1/1 Running 0 30s +proxy-7d98df9f7-s5gft 1/1 Running 0 30s +user-scheduler-668ff95ccf-fw6wv 1/1 Running 0 30s +user-scheduler-668ff95ccf-wq5xp 1/1 Running 0 30s +``` + +Be aware that the hub pod (i.e., hub-8446c9d589-vgjlw above) may take a minute or so to start. + +If something went wrong, you will have to edit the config YAML files to get things working. Before +trying to work things out yourself, check the FAQ to see if your issue has already been addressed. + +Depending on what file you edit, you may have to run different commands to update the EKS cluster and +deployment of JupyterHub. Follow the steps below to update: +1. 
If you only edited `helm-config.yaml`, try to just update the deployment of Jupyterhub by running `./update_jupyterhub_deployment.sh` +2. If step 1 failed, fully tear down the JupyterHub deployment with `./tear_down_jupyterhub.sh` and then re-deploy it with `./deploy_jupyterhub.sh` +3. If you edited `cluster-autoscaler.yaml` or `storage-class.yaml`, tear down the JupyterHub deployment with `./tear_down_jupyterhub.sh`. Then, reconfigure Kubernetes with `./configure_kubernetes.sh`, and re-deploy JupyterHub with `./deploy_jupyterhub.sh` +4. If you edited `eksctl-config.yaml`, fully tear down the cluster with `cleanup.sh`, and then restart from the top of this README + +## Step 5: Get the public cluster URL + +Now that everything's ready to go, we need to get the public URL to the cluster. + +To do this, run the following: + +```bash +$ ./get_jupyterhub_url.sh +``` + +Note that it can take several minutes after the URL is available for it to actually redirect +to JupyterHub. + +## Step 6: Distribute URL and password to attendees + +Now that we have our pulbic URL, we can give the attendees everything they need to join the tutorial. + +For attendees to access JupyterHub, they simply need to enter the public URL (from step 5) in their browser of choice. +This will take them to a login page. The login credentials are as follows: +* Username: anything the attendee wants (note: this should be unique for every user. Otherwise, users will share pods.) +* Password: the password specified towards the top of `helm-config.yaml` + +Once the attendees log in with these credentials, the Kubernetes autoscaler will spin up a pod for them (and grab new +resources, if needed). This pod will contain a JupyterLab instace with the tutorial materials and environment already +prepared for them. + +At this point, you can start presenting your interactive tutorial! + +## Step 7: Cleanup everything + +Once you are done with your tutorial, you should cleanup everything so that there are not continuing, unneccesary expenses +to your AWS account. To do this, simply run the following: + +```bash +$ ./cleanup.sh +``` + +After cleaning everything up, you can verify that everything has been cleaned up by going to the AWS web consle +and ensuring nothing from your tutorial still exists in CloudFormation and EKS. diff --git a/2025-eScience/infrastructure/production/check_hub_log.sh b/2025-eScience/infrastructure/production/check_hub_log.sh new file mode 100755 index 0000000..1c13e91 --- /dev/null +++ b/2025-eScience/infrastructure/production/check_hub_log.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +hub_pod_id=$(kubectl get pods -n default --no-headers=true | awk '/hub/{print $1}') +kubectl logs $hub_pod_id \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/check_init_container_log.sh b/2025-eScience/infrastructure/production/check_init_container_log.sh new file mode 100755 index 0000000..f4fd398 --- /dev/null +++ b/2025-eScience/infrastructure/production/check_init_container_log.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" 
+ echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if [ $# -ne 1 ]; then + echo "Usage: ./check_init_container_log.sh " + exit 1 +fi + +kubectl logs $1 -c init-tutorial-service \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/check_jupyterhub_status.sh b/2025-eScience/infrastructure/production/check_jupyterhub_status.sh new file mode 100755 index 0000000..10b4261 --- /dev/null +++ b/2025-eScience/infrastructure/production/check_jupyterhub_status.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl --namespace=default get pods + +echo "If there are issues with any pods, you can get more details with:" +echo " $ kubectl --namespace=default describe pod " \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/cleanup.sh b/2025-eScience/infrastructure/production/cleanup.sh new file mode 100755 index 0000000..0e2ecbd --- /dev/null +++ b/2025-eScience/infrastructure/production/cleanup.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +# Temporarily allow errors in the script so that the script won't fail +# if the JupyterHub deployment failed or was previously torn down +set +e +echo "Tearing down JupyterHub and uninstalling everything related to Helm:" +helm uninstall escience-2025-tutorial-jupyter +set -e + +echo "" +echo "Deleting all pods from the EKS cluster:" +kubectl delete pod --all-namespaces --all --force + +echo "" +echo "Deleting the EKS cluster:" +eksctl delete cluster --config-file ./eksctl-config.yaml --wait + +echo "" +echo "Everything is now cleaned up!" diff --git a/2025-eScience/infrastructure/production/cluster-autoscaler.yaml b/2025-eScience/infrastructure/production/cluster-autoscaler.yaml new file mode 100644 index 0000000..fb5fc59 --- /dev/null +++ b/2025-eScience/infrastructure/production/cluster-autoscaler.yaml @@ -0,0 +1,272 @@ +# The roles defined in this config file set permissions on several Kubernetes resources. 
+# +# Resources referred to: +# * events: resource representing information/responses generated from actions or changes taken against the cluster +# * endpoints: resource representing REST API endpoints within the cluster +# * pods/eviction: resource that terminates and removes pods when created +# * pods/status: resource used to query or edit the status of pods +# * nodes: resource representing the physical or virtual nodes of the cluster +# * namespaces: resource representing a group of isolated resources within the cluster +# * pods: resource representing a unit of computation that is deployed to a node +# * services: resource representing a networked application running in a pod and exposed over the network (either internal to the cluster or external to the broader internet) +# * replicationcontrollers: legacy resource for managing horizontal scaling (i.e., scale-out). Used for broader support across clouds +# * persistentvolumeclaims: resource representing a request for storage by a user +# * persistentvolumes: resource representing actual storage +# * replicasets: resource that creates replica pods that are used to ensure some minimum number of identical pods in the cluster +# * daemonsets: resource that ensures copies of pods are deployed to new nodes and removed from removed nodes +# * poddisruptionbudgets: resource that represents the cluster policy regarding the minimum number of pods that must remain available +# during voluntary disruptions (i.e., pod/node eviction not caused by something like hardware failure) +# * statefulsets: resource that maintains pod state +# * storageclasses: resource that describes different types of storage. Often used for things like QoS levels +# * csinodes: resource that describes a node's ability to interact with one or more storage providers. Mainly used by Kubernetes's scheduler +# * csidrivers: resource that provides information on the drivers for a single storage provider installed on a node +# * csistoragecapacities: resource that describes the available storage from different providers +# * jobs: resource that represents one-off tasks spread across one or more pods that must run to completion. Useful for certain types of setup and elasticity work +# * leases: resource that allows different pods, nodes, or kubelets (the Kubernetes daemon on a node) to lock shared resources. Think of it like a mutex +# * configmaps: resource representing non-confidential key-value pair info. 
Often used to decouple environment-specific configuration from container images +--- +# Create a Service Account that will act as the internal user during the creation +# of the autoscaling infrastructure and have all the appropriate roles and permissions assigned +# to do its work +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler + name: cluster-autoscaler + namespace: kube-system +--- +# Create a ClusterRole to set permissions for associated +# users across the entire cluster +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create or partially update events and endpoints + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + # Allow associated users to evict pods + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + # Allow associated users to update pod statuses + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + # Allow associated users to get and update the state of the autoscaler + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + # Allow associated users to be notified of changes to, list, get the state of, + # and fully update information related to nodes + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + # Allow associated users to be notified of changes to, list, and get the state of + # namespaces, pods, services, replication controllers, persistent volume claims, and + # persistent volumes + - apiGroups: [""] + resources: + - "namespaces" + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to, list, and get the state of + # replica sets and daemon sets + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to and list pod disruption budgets + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + # Allow associated users to be notified of changes to, list, and get the state of + # stateful sets, replica sets, and daemon sets + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + # Allow associated users to be notified of changes to, list, and get the state of + # all resources related to available storage + - apiGroups: ["storage.k8s.io"] + resources: + ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"] + verbs: ["watch", "list", "get"] + # Allow associated users to get the state of, list, be notified of changes to, and partially update + # jobs launched in the cluster + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + # Allow associated users to create leases + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + # Allow associated users to get the state of and fully update leases in the autoscaler + - apiGroups: ["coordination.k8s.io"] + resourceNames: ["cluster-autoscaler"] + resources: ["leases"] + verbs: ["get", "update"] +--- +# Create a Role to set permissions within the 'kube-system' namespace 
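+# Unlike the ClusterRole above, which grants permissions cluster-wide, this Role only applies inside
+# the kube-system namespace. If needed, it can be inspected after applying with, for example:
+#   kubectl -n kube-system describe role cluster-autoscaler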
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: cluster-autoscaler + # The permissions in this Role apply to the 'kube-system' namespace + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + # Allow associated users to create, list, and be notified of changes to config maps + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "list", "watch"] + # Allow associated users to delete, get the state of, fully update, and be notified of + # changes to config maps in the autoscaler's status and priority-expander subresources + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - "cluster-autoscaler-status" + - "cluster-autoscaler-priority-expander" + verbs: ["delete", "get", "update", "watch"] +--- +# Grant permissions defined by the ClusterRole +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the ClusterRole named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" workspace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Grant permissions defined by the Role +# to users defined by the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addons: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +# Use the Role named "cluster-autoscaler" in the binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cluster-autoscaler +# Use the ServiceAccount named "cluster-autoscaler" +# in the "kube-system" workspace in the binding +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system +--- +# Define deployment rules for pods and ReplicaSets +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + app: cluster-autoscaler +spec: + replicas: 1 # Number of pods to run + # Apply to pods where the app has a label called 'app' + # with value 'cluster-autoscaler' + selector: + matchLabels: + app: cluster-autoscaler + # Definition of created pods + template: + metadata: + labels: + app: cluster-autoscaler + # Allow Prometheus to collect monitoring data over port 8085 + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8085" + spec: + priorityClassName: system-cluster-critical + securityContext: + # The Kubelet must be run as a non-root user + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + # Use the default seccomp profile as specified by the + # container runtime + seccompProfile: + type: RuntimeDefault + serviceAccountName: cluster-autoscaler + # The container(s) to run within the pod. 
+ # Since we're running an autoscaler, we'll run the autoscaler + # as the pod's only container, and then we'll deploy other + # containers within the autoscaler to actually do work + containers: + # The main container for the pod will be the + # Kubernetes autoscaling container + - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.2 + name: cluster-autoscaler + resources: + # Maximum amount of compute resources allowed + limits: + cpu: 100m + memory: 600Mi + # Minimum amount of compute resources required + # Defaults to 'limits' if not specified + requests: + cpu: 100m + memory: 600Mi + command: + - ./cluster-autoscaler + - --v=4 + - --stderrthreshold=info + - --cloud-provider=aws + - --skip-nodes-with-local-storage=false + - --expander=least-waste + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/escience-2025-tutorial + volumeMounts: + # Mount the CA SSL/TLS certificates into the container + - name: ssl-certs + mountPath: /etc/ssl/certs/ca-certificates.crt + readOnly: true + # Always pull the digest of the image from the + # container registry. If the locally cached digest is + # the same as the pulled digest, use the cached container image. + # Otherwise, pull the container from the registry + imagePullPolicy: "Always" + securityContext: + # Don't let the pod have more privileges than the + # parent process + allowPrivilegeEscalation: false + capabilities: + # Remove all capabilities + drop: + - ALL + # Root filesystem (i.e., '/') is read-only + readOnlyRootFilesystem: true + volumes: + - name: ssl-certs + hostPath: + path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/2025-eScience/infrastructure/production/config.toml b/2025-eScience/infrastructure/production/config.toml new file mode 100644 index 0000000..35ed978 --- /dev/null +++ b/2025-eScience/infrastructure/production/config.toml @@ -0,0 +1,54 @@ +tutorial_name = "escience-2025-tutorial" + +[aws.eksctl] +cluster_name = "escience-2025-tutorial" +cluster_deployment_region = "us-east-1" +cluster_availability_zones = [ + "us-east-1a", + "us-east-1b", +] + +[[aws.eksctl.cluster_node_groups]] +zone = "us-east-1a" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 15 +min_size = 15 +max_size = 100 + +[[aws.eksctl.cluster_node_groups]] +zone = "us-east-1b" +instance_type = "c7i.12xlarge" +volume_size = 30 +desired_size = 15 +min_size = 15 +max_size = 100 + +[aws."Kubernetes autoscaler"] +cpu_max = "100m" +memory_max = "600Mi" +cpu_min = "100m" +memory_min = "600Mi" + +[aws.Helm] +max_concurrent_users = 30 +hub_password = "hpctutorial25" +hub_db_capacity = "32Gi" +ebs_storage_type = "gp3" +hub_container_image = "jupyterhub/k8s-hub" +hub_container_tag = "4.2.0" +spawner_container_image = "ghcr.io/llnl/reproducible-benchmarking-spawn" +spawner_container_tag = "escience-2025" +spawner_image_entrypoint = "/entrypoint.sh 32" +cpu_min = "32" +cpu_max = "32" +mem_min = "64G" +mem_max = "64G" +provide_extra_shmem = true +init_container_image = "ghcr.io/llnl/reproducible-benchmarking-init" +init_container_tag = "escience-2025" +init_image_entrypoint = "/entrypoint.sh" + +[aws."utility scripts"] +jupyterhub_helm_version = "4.2.0" +ebs_csidriver_version = "v1.45.0" diff --git a/2025-eScience/infrastructure/production/configure_kubernetes.sh b/2025-eScience/infrastructure/production/configure_kubernetes.sh new file mode 100755 index 0000000..5c4bee6 --- /dev/null +++ b/2025-eScience/infrastructure/production/configure_kubernetes.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set 
-e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +echo "Configuring the Cluster Autoscaler:" +kubectl apply -k "github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=v1.45.0" +kubectl apply -f ./cluster-autoscaler.yaml +echo "" +echo "Configuring the Storage Class:" +kubectl apply -f ./storage-class.yaml + +echo "" +echo "Patching the cluster to make the configured storage class the default:" +kubectl patch storageclass gp3 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' + +echo "" +echo "Done configuring Kubernetes!" +echo "" +echo "Next, you should run deploy_jupyterhub.sh to actually deploy JupyterHub and the tutorial." \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/create_cluster.sh b/2025-eScience/infrastructure/production/create_cluster.sh new file mode 100755 index 0000000..f631168 --- /dev/null +++ b/2025-eScience/infrastructure/production/create_cluster.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v eksctl >/dev/null 2>&1; then + echo "ERROR: 'eksctl' is required to create a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://eksctl.io/installation/" + exit 1 +fi + +echo "Creating EKS cluster with eksctl:" +eksctl create cluster --config-file ./eksctl-config.yaml + +echo "Done creating the EKS cluster!" +echo "" +echo "Next, you should run configure_kubernetes.sh to configure Kubernetes on the cluster." \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/deploy_jupyterhub.sh b/2025-eScience/infrastructure/production/deploy_jupyterhub.sh new file mode 100755 index 0000000..5a391e9 --- /dev/null +++ b/2025-eScience/infrastructure/production/deploy_jupyterhub.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +echo "Adding JupyterHub to EKS cluster using Helm:" +helm repo add jupyterhub https://hub.jupyter.org/helm-chart/ +helm repo update +echo "" +echo "Installing the Helm chart and deploying JupyterHub to EKS:" +helm install escience-2025-tutorial-jupyter jupyterhub/jupyterhub --version 4.2.0 --values ./helm-config.yaml + +echo "" +echo "Done deploying JupyterHub!" +echo "" +echo "Next, you should ensure all the pods spawned correctly with check_jupyterhub_status.sh," +echo "and you should get the cluster URL with get_jupyterhub_url.sh." +echo "" +echo "If something went wrong, you can edit the helm-config.yaml file to try to fix the issue." +echo "After editing helm-config.yaml, you can normally reconfigure and relaunch JupyterHub using" +echo "the update_jupyterhub_deployment.sh script. If that doesn't work or if you need to edit" +echo "storage-class.yaml or cluster-autoscaler.yaml, you should first tear down JupyterHub with" +echo "tear_down_jupyterhub.sh, and then you should bring Jupyter back up by rerunning deploy_jupyterhub.sh." 
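+# Optional tip (not run by this script): while the hub and proxy pods are starting, you can watch
+# their status with:
+#   kubectl --namespace=default get pods --watch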
+echo "" +echo "If everything went smoothly, the cluster URL is what you should share with attendees." +echo "" +echo "Attendees can get a Jupyter environment to work in by going to that URL and logging in" +echo "with a username of their choice and the password specified in helm-config.yaml." +echo "" +echo "Note: users should have unique usernames. If two users have the same username, they will" +echo " share the same pod." +echo "" +echo "After you are done with your tutorial, you should finally run cleanup.sh to bring down" +echo "the EKS cluster and all associated resources." diff --git a/2025-eScience/infrastructure/production/eksctl-config.yaml b/2025-eScience/infrastructure/production/eksctl-config.yaml new file mode 100644 index 0000000..6a7ea06 --- /dev/null +++ b/2025-eScience/infrastructure/production/eksctl-config.yaml @@ -0,0 +1,110 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +# Define the name of the cluster and the deployment region +metadata: + name: escience-2025-tutorial + region: us-east-1 + +# Create the IAM policies needed to enable the autoscaler and storage +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: cluster-autoscaler + + # https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "autoscaling:DescribeAutoScalingGroups" + - "autoscaling:DescribeAutoScalingInstances" + - "autoscaling:DescribeLaunchConfigurations" + - "autoscaling:DescribeTags" + - "autoscaling:SetDesiredCapacity" + - "autoscaling:TerminateInstanceInAutoScalingGroup" + - "ec2:DescribeLaunchTemplateVersions" + Resource: "*" + + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: aws-ebs-csi-driver + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ec2:AttachVolume" + - "ec2:CreateSnapshot" + - "ec2:CreateTags" + - "ec2:CreateVolume" + - "ec2:DeleteSnapshot" + - "ec2:DeleteTags" + - "ec2:DeleteVolume" + - "ec2:DescribeInstances" + - "ec2:DescribeSnapshots" + - "ec2:DescribeTags" + - "ec2:DescribeVolumes" + - "ec2:DetachVolume" + Resource: "*" + +# Specify the availability zone from which nodes will be obtained +availabilityZones: +- "us-east-1a" +- "us-east-1b" + + +# Define rules for nodegroups for each availability zone +managedNodeGroups: + + - name: node-group-us-east-1a + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of storage volume for the availability zone, in gigabytes + volumeSize: 30 + # Number of nodes to start with in this availability zone + desiredCapacity: 15 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 15 + # Maximum number of nodes that will every be allocated in this availability zone + maxSize: 100 + privateNetworking: true + availabilityZones: + - us-east-1a + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" + + - name: node-group-us-east-1b + # Set policies/permissions to autoscale + iam: + withAddonPolicies: + autoScaler: true + # Instance type to allocate + instanceType: c7i.12xlarge + # Size of storage volume for the availability zone, in gigabytes + volumeSize: 30 + # Number of nodes to start with in 
this availability zone + desiredCapacity: 15 + # Minimum number of nodes that will always be allocated in this availability zone + minSize: 15 + # Maximum number of nodes that will every be allocated in this availability zone + maxSize: 100 + privateNetworking: true + availabilityZones: + - us-east-1b + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" diff --git a/2025-eScience/infrastructure/production/get_jupyterhub_url.sh b/2025-eScience/infrastructure/production/get_jupyterhub_url.sh new file mode 100755 index 0000000..ddfd250 --- /dev/null +++ b/2025-eScience/infrastructure/production/get_jupyterhub_url.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: 'kubectl' is required to configure a Kubernetes cluster on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://kubernetes.io/docs/tasks/tools/#kubectl" + exit 1 +fi + +kubectl get -o json service proxy-public | jq '.status.loadBalancer.ingress[0].hostname' \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/helm-config.yaml b/2025-eScience/infrastructure/production/helm-config.yaml new file mode 100644 index 0000000..b259210 --- /dev/null +++ b/2025-eScience/infrastructure/production/helm-config.yaml @@ -0,0 +1,121 @@ +# Uncomment if you need to debug your deployment of Jupyter. +# For more information on debugging, see: +# https://z2jh.jupyter.org/en/stable/administrator/debug.html +# debug: +# enabled: true + +hub: + # Maximum number of users with spawned JupyterLab environments (i.e., pods) at a time + concurrentSpawnLimit: 30 + config: + # Define a password for login + DummyAuthenticator: + password: hpctutorial25 + JupyterHub: + admin_access: true + authenticator_class: dummy + + # Define storage quantity for JupyterHub's persistent database + # We could explicitly set storage class name here, + # but we won't because we've marked the storage class defined + # in storage-class.yaml as default + db: + pvc: + storage: 32Gi + storageClassName: gp3 + + # Specify the hub image for the tutorial. + # The hub image should be based off of the jupyterhub/k8s-hub image. + # Its job is twofold: + # 1) If desired, replace the login page (at /usr/local/share/jupyterhub/templates/login.html) with a custom HTML login page + # 2) Set the user + image: + name: jupyterhub/k8s-hub + tag: "4.2.0" + pullPolicy: Always + + # Define resource usage for JupyterHub + # For large tutorials, it is recommended to set these higher + + # We are just using defualt resource usage + + + # Define custom hostname for JupyterHub + + # We are not using a custom hostname + + +# Based on optimization recommendations from: +# https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +# scheduling: +# podPriority: +# enabled: true +# userPlaceholder: +# replicas: 3 + +# Define the spawner and init containers for each attendee's pod +singleuser: + # Specify the spawner image for the tutorial. 
+ # The spawner image should do the following: + # 1) Install any necessary software + # 2) Define the user for the tutorial (we usually default to jovyan) + # 3) If custom Python packages are needed, it's often recommended to install a custom Jupyter kernel with `IPython kernel install` + # 4) If you want a custom Jupyter launcher UI, install the appropriate packages and update JUPYTER_APP_LAUNCHER_PATH + # 5) Copy any necessary local scripts or files and ensure proper permissions + image: + name: ghcr.io/llnl/reproducible-benchmarking-spawn + tag: "escience-2025" + pullPolicy: Always + # Specify the minimum (i.e., guarantee) and maximum (i.e., limit) amount of resources per user + cpu: + limit: 32 + guarantee: 32 + memory: + limit: "64G" + guarantee: "64G" + # If needed, specify a custom entrypoint into the spawner image. + # For more information, look at the documentation for Docker ENTRYPOINT and CMD directives: + # https://www.docker.com/blog/docker-best-practices-choosing-between-run-cmd-and-entrypoint/ + cmd: ["/entrypoint.sh", "32"] + # Specify the init image for the tutorial. + # This image is optional, but it can be used to do last second configuration or installation of files + # before the user gains control of the pod. + # + # A good usecase for the init image is to set permissions and ensure the tutorial user will be able to + # access the files for your tutorial. An example Dockerfile for the init image may look like: + # + # Dockerfile: + # FROM alpine/git + # ENV NB_USER=jovyan \ + # NB_UID=1000 \ + # HOME=/home/jovyan + # + # RUN adduser \ + # -D \ + # -g "Default user" \ + # -u ${NB_UID} \ + # -h ${HOME} \ + # ${NB_USER} + # + # COPY ./init-entrypoint.sh /entrypoint.sh + # + # The 'command' field for the init container specifies the entrypoint for the container. For the Dockerfile + # above, the entrypoint should be "/entrypoint.sh". This script could look something like this: + # + # entrypoint.sh (would be ./init-entrypoint.sh on your local computer) + # chown -R 1000 /home/jovyan + initContainers: + - name: init-tutorial-service + image: ghcr.io/llnl/reproducible-benchmarking-init:escience-2025 + command: ["/entrypoint.sh"] + imagePullPolicy: Always + storage: + type: none + extraVolumes: + - name: shm-volume + emptyDir: + medium: Memory + extraVolumeMounts: + - name: shm-volume + mountPath: /dev/shm + diff --git a/2025-eScience/infrastructure/production/storage-class.yaml b/2025-eScience/infrastructure/production/storage-class.yaml new file mode 100644 index 0000000..b83a030 --- /dev/null +++ b/2025-eScience/infrastructure/production/storage-class.yaml @@ -0,0 +1,7 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: gp3 +provisioner: kubernetes.io/aws-ebs +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete \ No newline at end of file diff --git a/2025-eScience/infrastructure/production/tear_down_jupyterhub.sh b/2025-eScience/infrastructure/production/tear_down_jupyterhub.sh new file mode 100755 index 0000000..d748b69 --- /dev/null +++ b/2025-eScience/infrastructure/production/tear_down_jupyterhub.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm uninstall escience-2025-tutorial-jupyter + +echo "Helm's JupyterHub deployment is torn down." 
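+# For reference, single-user pods created by JupyterHub are typically named 'jupyter-<username>'.
+# A suggested (optional) way to list any that remain, assuming the default namespace:
+#   kubectl --namespace=default get pods | grep '^jupyter-'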
+echo "If any attendee pods are remaining, you can delete them with 'kubectl delete pod '" +echo "" +echo "To recreate the JupyterHub deployment, just run deploy_jupyterhub.sh again." diff --git a/2025-eScience/infrastructure/production/update_jupyterhub_deployment.sh b/2025-eScience/infrastructure/production/update_jupyterhub_deployment.sh new file mode 100755 index 0000000..d2d2add --- /dev/null +++ b/2025-eScience/infrastructure/production/update_jupyterhub_deployment.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -e + +if ! command -v helm >/dev/null 2>&1; then + echo "ERROR: 'helm' is required to configure and launch JupyterHub on AWS with this script!" + echo " Installation instructions can be found here:" + echo " https://helm.sh/docs/intro/install/" + exit 1 +fi + +helm upgrade escience-2025-tutorial-jupyter jupyterhub/jupyterhub --values ./helm-config.yaml + +echo "The JupyterHub deployment is updated!" diff --git a/2025-eScience/infrastructure/stub.txt b/2025-eScience/infrastructure/stub.txt new file mode 100644 index 0000000..1d3fcd6 --- /dev/null +++ b/2025-eScience/infrastructure/stub.txt @@ -0,0 +1 @@ +This is a stub file to make sure the directory appears in GitHub. \ No newline at end of file diff --git a/2025-eScience/slides/stub.txt b/2025-eScience/slides/stub.txt new file mode 100644 index 0000000..1d3fcd6 --- /dev/null +++ b/2025-eScience/slides/stub.txt @@ -0,0 +1 @@ +This is a stub file to make sure the directory appears in GitHub. \ No newline at end of file diff --git a/2025-eScience/tutorial-code/caliper-tutorial b/2025-eScience/tutorial-code/caliper-tutorial new file mode 160000 index 0000000..942c1a0 --- /dev/null +++ b/2025-eScience/tutorial-code/caliper-tutorial @@ -0,0 +1 @@ +Subproject commit 942c1a0da4e9ef5f809d03f15d97c938ea4c6531 diff --git a/2025-eScience/tutorial-code/system-description/AWS_Tutorial-c7i-EFA/hardware_description.yaml b/2025-eScience/tutorial-code/system-description/AWS_Tutorial-c7i-EFA/hardware_description.yaml new file mode 100644 index 0000000..c7b89e4 --- /dev/null +++ b/2025-eScience/tutorial-code/system-description/AWS_Tutorial-c7i-EFA/hardware_description.yaml @@ -0,0 +1,31 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +system_definition: + name: AWS_Tutorial-c7i-EFA + integrator: + vendor: AWS + name: EKS + processor: + vendor: Intel + name: Xeon Sapphire Rapids + ISA: x86_64 + uArch: SapphireRapids + accelerator: + vendor: + name: + ISA: + uArch: + interconnect: + vendor: AWS + name: EFA + systems-tested: + aws-pcluster: + os: ubuntu-22.04 + scheduler: flux + compiler: gcc + runtime: + mpi: mpich + instance-types: https://aws.amazon.com/ec2/instance-types/c7i/ diff --git a/2025-eScience/tutorial-code/system-description/aws-tutorial/system.py b/2025-eScience/tutorial-code/system-description/aws-tutorial/system.py new file mode 100644 index 0000000..b7ed705 --- /dev/null +++ b/2025-eScience/tutorial-code/system-description/aws-tutorial/system.py @@ -0,0 +1,264 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import json +import subprocess + +from benchpark.directives import maintainers, variant +from benchpark.openmpsystem import OpenMPCPUOnlySystem +from benchpark.paths import hardware_descriptions +from benchpark.system import System + + +class AwsTutorial(System): + # Taken from https://aws.amazon.com/ec2/instance-types/ + # With boto3, we could determine this dynamically vs. storing a static table + + maintainers("stephanielam3211") + + id_to_resources = { + "c7i.48xlarge": { + "system_site": "aws", + "hardware_key": str(hardware_descriptions) + + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", + }, + "c7i.metal-48xl": { + "system_site": "aws", + "hardware_key": str(hardware_descriptions) + + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", + }, + "c7i.24xlarge": { + "system_site": "aws", + "hardware_key": str(hardware_descriptions) + + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", + }, + "c7i.metal-24xl": { + "system_site": "aws", + "hardware_key": str(hardware_descriptions) + + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", + }, + "c7i.12xlarge": { + "system_site": "aws", + "hardware_key": str(hardware_descriptions) + + "/AWS_Tutorial-zen-EFA/hardware_description.yaml", + }, + } + + variant( + "instance_type", + values=("c7i.48xlarge", "c7i.metal-48xl", "c7i.24xlarge", "c7i.metal-24xl", "c7i.12xlarge"), + default="c7i.24xlarge", + description="AWS instance type", + ) + + def __init__(self, spec): + super().__init__(spec) + self.programming_models = [OpenMPCPUOnlySystem()] + + self.scheduler = "flux" + # TODO: for some reason I have to index to get value, even if multi=False + attrs = self.id_to_resources.get(self.spec.variants["instance_type"][0]) + for k, v in attrs.items(): + setattr(self, k, v) + + json_resource_spec = subprocess.check_output("flux resource R", shell=True) + resource_dict = json.loads(json_resource_spec) + self.sys_cores_per_node = resource_dict["execution"]["R_lite"][0]["children"][ + "core" + ] + self.sys_cores_per_node = [int(c) for c in self.sys_cores_per_node.split("-")] + self.sys_cores_per_node[-1] += 1 + self.sys_cores_per_node = len(list(range(*self.sys_cores_per_node))) + self.sys_nodes = resource_dict["execution"]["R_lite"][0]["rank"] + self.sys_nodes = [int(n) for n in self.sys_nodes.split("-")] + self.sys_nodes[-1] += 1 + self.sys_nodes = len(list(range(*self.sys_nodes))) + + # def system_specific_variables(self): + # return { + # "extra_cmd_opts": '--mpi=pmix --export=ALL,FI_EFA_USE_DEVICE_RDMA=1,FI_PROVIDER="efa",OMPI_MCA_mtl_base_verbose=100', + # } + + def compute_packages_section(self): + return { + "packages": { + "tar": { + "externals": [{"spec": "tar@1.34", "prefix": "/usr"}], + "buildable": False, + }, + "gmake": {"externals": [{"spec": "gmake@4.3", "prefix": "/usr"}]}, + "blas": { + "externals": [{"spec": "blas@0.29.2", "prefix": "/usr"}], + "buildable": False, + }, + "lapack": { + "externals": [{"spec": "lapack@0.29.2", "prefix": "/usr"}], + "buildable": False, + }, + "mpi": {"buildable": False}, + "openmpi": { + "externals": [ + { + "spec": "openmpi@4.0%gcc@11.4.0", + "prefix": "/usr", + } + ] + }, + "cmake": { + "externals": [{"spec": "cmake@4.0.2", "prefix": "/usr"}], + "buildable": False, + }, + "git": { + "externals": [{"spec": "git@2.34.1~tcltk", "prefix": "/usr"}], + "buildable": False, + }, + "openssl": { + "externals": [{"spec": "openssl@3.0.2", "prefix": "/usr"}], + "buildable": False, + }, + "automake": { + "externals": [{"spec": "automake@1.16.5", "prefix": "/usr"}], + 
"buildable": False, + }, + "openssh": { + "externals": [{"spec": "openssh@8.9p1", "prefix": "/usr"}], + "buildable": False, + }, + "m4": { + "externals": [{"spec": "m4@1.4.18", "prefix": "/usr"}], + "buildable": False, + }, + "sed": { + "externals": [{"spec": "sed@4.8", "prefix": "/usr"}], + "buildable": False, + }, + "autoconf": { + "externals": [{"spec": "autoconf@2.71", "prefix": "/usr"}], + "buildable": False, + }, + "diffutils": { + "externals": [{"spec": "diffutils@3.8", "prefix": "/usr"}], + "buildable": False, + }, + "coreutils": { + "externals": [{"spec": "coreutils@8.32", "prefix": "/usr"}], + "buildable": False, + }, + "findutils": { + "externals": [{"spec": "findutils@4.8.0", "prefix": "/usr"}], + "buildable": False, + }, + "binutils": { + "externals": [ + {"spec": "binutils@2.38+gold~headers", "prefix": "/usr"} + ], + "buildable": False, + }, + "perl": { + "externals": [ + { + "spec": "perl@5.34.0~cpanm+opcode+open+shared+threads", + "prefix": "/usr", + } + ], + "buildable": False, + }, + "caliper": { + "externals": [ + { + "spec": "caliper@master%gcc@11.4.0+adiak+mpi", + "prefix": "/usr", + } + ], + "buildable": False, + }, + "adiak": { + "externals": [{"spec": "adiak@0.4.1", "prefix": "/usr"}], + "buildable": False, + }, + "groff": { + "externals": [{"spec": "groff@1.22.4", "prefix": "/usr"}], + "buildable": False, + }, + "curl": { + "externals": [ + {"spec": "curl@7.81.0+gssapi+ldap+nghttp2", "prefix": "/usr"} + ], + "buildable": False, + }, + "ccache": { + "externals": [{"spec": "ccache@4.5.1", "prefix": "/usr"}], + "buildable": False, + }, + "flex": { + "externals": [{"spec": "flex@2.6.4+lex", "prefix": "/usr"}], + "buildable": False, + }, + "pkg-config": { + "externals": [{"spec": "pkg-config@0.29.2", "prefix": "/usr"}], + "buildable": False, + }, + "zlib": { + "externals": [{"spec": "zlib@1.2.11", "prefix": "/usr"}], + "buildable": False, + }, + "ninja": { + "externals": [{"spec": "ninja@1.10.1", "prefix": "/usr"}], + "buildable": False, + }, + "libtool": { + "externals": [{"spec": "libtool@2.4.6", "prefix": "/usr"}], + "buildable": False, + }, + } + } + + def compute_compilers_section(self): + return { + "compilers": [ + { + "compiler": { + "spec": "gcc@11.4.0", + "paths": { + "cc": "/usr/bin/gcc", + "cxx": "/usr/bin/g++", + "f77": "/usr/bin/gfortran-11", + "fc": "/usr/bin/gfortran-11", + }, + "flags": {}, + "operating_system": "ubuntu22.04", + "target": "x86_64", + "modules": [], + "environment": {}, + "extra_rpaths": [], + } + } + ] + } + + def compute_software_section(self): + return { + "software": { + "packages": { + "default-compiler": {"pkg_spec": "gcc@11.4.0"}, + "default-mpi": {"pkg_spec": "openmpi@4.0%gcc@11.4.0"}, + "compiler-gcc": {"pkg_spec": "gcc@11.4.0"}, + "lapack": {"pkg_spec": "lapack@0.29.2"}, + "mpi-gcc": {"pkg_spec": "openmpi@4.0%gcc@11.4.0"}, + } + } + } + + def compute_spack_config_section(self): + return { + "config": {}, + "concretizer": {}, + "modules": {}, + "packages": {}, + "repos": [], + "compilers": [], + "mirrors": {}, + "providers": {"mpi": ["openmpi"]}, + } diff --git a/2025-eScience/tutorial-code/thicket-tutorial b/2025-eScience/tutorial-code/thicket-tutorial new file mode 160000 index 0000000..169e9c8 --- /dev/null +++ b/2025-eScience/tutorial-code/thicket-tutorial @@ -0,0 +1 @@ +Subproject commit 169e9c86250fae999c6d4433d66b56e1b042bb70