CUDA Callback API (HAPI) #282

Merged (16 commits) on May 2, 2025
5 changes: 5 additions & 0 deletions charm4py/charm.py
@@ -127,6 +127,7 @@ def __init__(self):
self.CkChareSend = self.lib.CkChareSend
self.CkGroupSend = self.lib.CkGroupSend
self.CkArraySend = self.lib.CkArraySend
self.hapiAddCudaCallback = self.lib.hapiAddCudaCallback
self.reducers = reduction.ReducerContainer(self)
self.redMgr = reduction.ReductionManager(self, self.reducers)
self.mainchareRegistered = False
@@ -933,6 +934,10 @@ def recordSendRecv(self, stats, size):
stats[2] = max(size, stats[2])
stats[3] += size
stats[4] = size

# deposit value of one of the futures that was created on this PE
def _future_deposit_result(self, fid, result=None):
self.threadMgr.depositFuture(fid, result)

def __printTable__(self, table, sep):
col_width = [max(len(x) for x in col) for col in zip(*table)]
2 changes: 2 additions & 0 deletions charm4py/charmlib/ccharm.pxd
@@ -70,6 +70,8 @@ cdef extern from "charm.h":
void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs);

void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid);

cdef extern from "conv-header.h":
ctypedef void (*CmiHandler)(void* )
cdef const int CmiReservedHeaderSize
9 changes: 9 additions & 0 deletions charm4py/charmlib/charmlib_cython.pyx
@@ -867,6 +867,15 @@ class CharmLib(object):
cdef int replyLen = len(message_bytes)
CcsSendReply(replyLen, <const void*>replyData)

def hapiAddCudaCallback(self, stream, future):
if not HAVE_CUDA_BUILD:
raise Charm4PyError("HAPI usage not allowed: Charm++ was not built with CUDA support")
id = future.fid
CkHapiAddCallback(<long> stream, depositFutureWithId, <int> id)

# invoked by HAPI once the CUDA stream reaches the callback inserted above; 'param'
# carries the future id registered via CkHapiAddCallback, and that future is
# deposited (with a None result) so the coroutine blocked on it can resume
cdef void depositFutureWithId(void *param, void* message) noexcept:
    cdef int futureId = <int> param
    charm._future_deposit_result(futureId, None)

# first callback from Charm++ shared library
cdef void registerMainModule() noexcept:
91 changes: 91 additions & 0 deletions docs/gpus.rst
@@ -0,0 +1,91 @@
====
GPUs
====

.. .. contents::


GPUs are supported in Charm4py via the Charm++ HAPI (Hybrid API) interface.
Presently, this support allows asynchronous completion detection of GPU kernels via Charm4py futures,
using the function ``charm.hapiAddCudaCallback``.

The Charm4py API for HAPI is:

.. code-block:: python

def hapiAddCudaCallback(stream, future)

.. note::

For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback
into the stream such that when the callback is reached, the corresponding Charm4py future is set.
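
A torch stream can be used in the same way, passing its ``cuda_stream`` handle. The
snippet below is a minimal sketch, assuming PyTorch built with CUDA support and that it
runs inside a Charm4py coroutine (for example the program's main function); the GPU work
enqueued on the stream is elided:

.. code-block:: python

    import torch
    from charm4py import charm

    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        ...  # enqueue asynchronous GPU work on this stream

    # insert a callback into the stream; the future is set once the
    # preceding work enqueued on the stream has completed
    future = charm.Future()
    charm.hapiAddCudaCallback(stream.cuda_stream, future)
    future.get()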

Enabling HAPI
-------------
To build Charm4py with HAPI support, add "cuda" to the Charm++ build options and follow the steps to build Charm4py from source:

.. code-block:: shell

export CHARM_EXTRA_BUILD_OPTS="cuda"
pip install .

.. warning::

To ensure that the underlying Charm++ build has CUDA enabled, remove any pre-existing builds in charm_src/charm before setting the CUDA option and running the install.

Examples
--------

.. code-block:: python

from charm4py import charm
import time
import numba.cuda as cuda
import numpy as np

@cuda.jit
def elementwise_sum_kernel(x_in, x_out):
idx = cuda.grid(1)
if idx < x_in.shape[0]:
x_out[idx] = x_in[idx] + x_in[idx]

def main(args):
N = 1_000_000
array_size = (N,)

s = cuda.stream()
stream_handle = s.handle.value

A_host = np.arange(N, dtype=np.float32)

A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
A_gpu.copy_to_device(A_host, stream=s)

threads_per_block = 128
blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block

print("Launching kernel and inserting callback...")
start_time = time.perf_counter()
elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)

return_fut = charm.Future()
charm.hapiAddCudaCallback(stream_handle, return_fut)
return_fut.get()
kernel_done_time = time.perf_counter()
print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")

B_host = B_gpu.copy_to_host(stream=s)

s.synchronize()

sum_result = np.sum(B_host)
print(f"Sum of result is {sum_result}")

charm.exit()

charm.start(main)


The above example demonstrates how to use the Charm4py HAPI interface to insert a callback into a CUDA stream and track
completion of a numba kernel launch.
1 change: 1 addition & 0 deletions docs/index.rst
@@ -41,6 +41,7 @@ to the largest supercomputers.
sections
pool
rules
gpus

.. toctree::
:maxdepth: 2
27 changes: 27 additions & 0 deletions examples/cuda/hapi/README.md
@@ -0,0 +1,27 @@
## Using Charm4py with CUDA

### HAPI CUDA Callback

Example overview

- The example in `hapi-cuda-callback.py` demonstrates usage of `addCudaCallback` from the Charm++ HAPI library
- `addCudaCallback` enables an asynchronous mechanism to wait for kernel completion via Charm4py futures
- The example is based around a simple Numba CUDA kernel.

Usage

- `hapiAddCudaCallback` requires a CUDA stream handle and a future
- Access to the CUDA stream handle depends on the Python library being used. For example:
  - using torch: `stream_handle = torch.cuda.Stream().cuda_stream`
  - using numba: `stream_handle = numba.cuda.stream().handle.value`
- Currently, `hapiAddCudaCallback` is restricted to torch- and numba-based CUDA streams; a minimal usage sketch follows this list.
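
A minimal usage sketch with a Numba stream (mirroring `hapi-cuda-callback.py`; the kernel launch is elided):

```python
from charm4py import charm
import numba.cuda as cuda

def main(args):
    s = cuda.stream()                      # Numba CUDA stream
    # ... launch kernels / async copies on stream s here ...
    done = charm.Future()
    # the callback is inserted into the stream; the future is set once all
    # work enqueued on the stream before the callback has completed
    charm.hapiAddCudaCallback(s.handle.value, done)
    done.get()                             # wait for the callback to fire
    charm.exit()

charm.start(main)
```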

Running example

- If running locally, use:

  `$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py`

- If running on a cluster machine with Slurm, use:

  `$ srun -n <N> python3 hapi-cuda-callback.py`
47 changes: 47 additions & 0 deletions examples/cuda/hapi/hapi-cuda-callback.py
@@ -0,0 +1,47 @@
from charm4py import charm
import time
import numba.cuda as cuda
import numpy as np

@cuda.jit
def elementwise_sum_kernel(x_in, x_out):
idx = cuda.grid(1)
if idx < x_in.shape[0]:
x_out[idx] = x_in[idx] + x_in[idx]

def main(args):
N = 1_000_000
array_size = (N,)

s = cuda.stream()
stream_handle = s.handle.value

A_host = np.arange(N, dtype=np.float32)

A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
A_gpu.copy_to_device(A_host, stream=s)

threads_per_block = 128
blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block

print("Launching kernel and inserting callback...")
start_time = time.perf_counter()
elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)

return_fut = charm.Future()
charm.hapiAddCudaCallback(stream_handle, return_fut)
return_fut.get()
kernel_done_time = time.perf_counter()
print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")

B_host = B_gpu.copy_to_host(stream=s)

s.synchronize()

sum_result = np.sum(B_host)
print(f"Sum of result is {sum_result}")

charm.exit()

charm.start(main)
10 changes: 7 additions & 3 deletions setup.py
@@ -325,15 +325,18 @@ def install(self):
cobject_extra_args=["-Wl,-rpath,@loader_path/.libs"]
else:
cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"]


# check whether a CUDA-enabled Charm++ build was requested; the flag is passed to
# Cython below as the HAVE_CUDA_BUILD compile-time constant
cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('cuda') != -1

extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython',
sources=['charm4py/charmlib/charmlib_cython.pyx'],
include_dirs=['charm_src/charm/include'] + my_include_dirs,
library_dirs=[os.path.join(os.getcwd(), 'charm4py', '.libs')],
libraries=["charm"],
extra_compile_args=[],
extra_link_args=extra_link_args,
), compile_time_env={'HAVE_NUMPY': haveNumpy}))
), compile_time_env={'HAVE_NUMPY': haveNumpy,
'HAVE_CUDA_BUILD': cudaBuild}))

extensions.extend(cythonize(setuptools.Extension('charm4py.c_object_store',
sources=['charm4py/c_object_store.pyx'],
@@ -342,7 +345,8 @@ def install(self):
libraries=["charm"],
extra_compile_args=[],
extra_link_args=cobject_extra_args,
), compile_time_env={'HAVE_NUMPY': haveNumpy}))
), compile_time_env={'HAVE_NUMPY': haveNumpy,
'HAVE_CUDA_BUILD': cudaBuild}))


additional_setup_keywords = {}