diff --git a/charm4py/charm.py b/charm4py/charm.py
index 396da365..983953c8 100644
--- a/charm4py/charm.py
+++ b/charm4py/charm.py
@@ -127,6 +127,7 @@ def __init__(self):
         self.CkChareSend = self.lib.CkChareSend
         self.CkGroupSend = self.lib.CkGroupSend
         self.CkArraySend = self.lib.CkArraySend
+        self.hapiAddCudaCallback = self.lib.hapiAddCudaCallback
         self.reducers = reduction.ReducerContainer(self)
         self.redMgr = reduction.ReductionManager(self, self.reducers)
         self.mainchareRegistered = False
@@ -933,6 +934,10 @@ def recordSendRecv(self, stats, size):
         stats[2] = max(size, stats[2])
         stats[3] += size
         stats[4] = size
+
+    # deposit the value of one of the futures that was created on this PE
+    def _future_deposit_result(self, fid, result=None):
+        self.threadMgr.depositFuture(fid, result)

     def __printTable__(self, table, sep):
         col_width = [max(len(x) for x in col) for col in zip(*table)]
diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd
index b870f5fe..b2af2e4a 100644
--- a/charm4py/charmlib/ccharm.pxd
+++ b/charm4py/charmlib/ccharm.pxd
@@ -70,6 +70,8 @@ cdef extern from "charm.h":
     void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
     void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam, double curWallTime), void *arg, double msecs);

+    void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid);
+
 cdef extern from "conv-header.h":
     ctypedef void (*CmiHandler)(void* )
     cdef const int CmiReservedHeaderSize
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index 5d941f24..56b91390 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -867,6 +867,15 @@ class CharmLib(object):
     cdef int replyLen = len(message_bytes)
     CcsSendReply(replyLen, replyData)

+  def hapiAddCudaCallback(self, stream, future):
+    if not HAVE_CUDA_BUILD:
+      raise Charm4PyError("HAPI usage not allowed: Charm++ was not built with CUDA support")
+    id = future.fid
+    CkHapiAddCallback(<long> stream, depositFutureWithId, id)
+
+cdef void depositFutureWithId(void *param, void* message) noexcept:
+  cdef int futureId = <int> param
+  charm._future_deposit_result(futureId, None)

 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:
diff --git a/docs/gpus.rst b/docs/gpus.rst
new file mode 100644
index 00000000..0cb1ebde
--- /dev/null
+++ b/docs/gpus.rst
@@ -0,0 +1,91 @@
+====
+GPUs
+====
+
+.. .. contents::
+
+
+GPUs are supported in Charm4py via the Charm++ HAPI (Hybrid API) interface.
+Presently, this support allows asynchronous completion detection of GPU kernels via Charm4py futures,
+using the function ``charm.hapiAddCudaCallback``.
+
+The Charm4py HAPI API is:
+
+.. code-block:: python
+
+    def hapiAddCudaCallback(stream, future)
+
+.. note::
+
+   For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback
+   into the stream such that, when the callback is reached, the corresponding Charm4py future is set.
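+
+For reference, a raw stream handle can typically be obtained as follows (a minimal
+sketch, not part of the Charm4py API itself; it assumes numba and/or torch are installed):
+
+.. code-block:: python
+
+    import numba.cuda as numba_cuda
+    import torch
+
+    # numba: the raw CUDA stream pointer is exposed as handle.value
+    numba_stream = numba_cuda.stream()
+    numba_handle = numba_stream.handle.value
+
+    # torch: the raw CUDA stream pointer is exposed as cuda_stream
+    torch_stream = torch.cuda.Stream()
+    torch_handle = torch_stream.cuda_stream
+
+Either handle can then be passed as the ``stream`` argument of ``charm.hapiAddCudaCallback``.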
+
+Enabling HAPI
+-------------
+To build Charm4py with HAPI support, add "cuda" to the Charm++ build options and follow the steps to build Charm4py from source:
+
+.. code-block:: shell
+
+    export CHARM_EXTRA_BUILD_OPTS="cuda"
+    pip install .
+
+.. warning::
+
+   To ensure that the underlying Charm++ build has CUDA enabled, remove any pre-existing builds in ``charm_src/charm`` before setting the CUDA option and running the install.
+
+Examples
+--------
+
+.. code-block:: python
+
+    from charm4py import charm
+    import time
+    import numba.cuda as cuda
+    import numpy as np
+
+    @cuda.jit
+    def elementwise_sum_kernel(x_in, x_out):
+        idx = cuda.grid(1)
+        if idx < x_in.shape[0]:
+            x_out[idx] = x_in[idx] + x_in[idx]
+
+    def main(args):
+        N = 1_000_000
+        array_size = (N,)
+
+        s = cuda.stream()
+        stream_handle = s.handle.value
+
+        A_host = np.arange(N, dtype=np.float32)
+
+        A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+        B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+        A_gpu.copy_to_device(A_host, stream=s)
+
+        threads_per_block = 128
+        blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block
+
+        print("Launching kernel and inserting callback...")
+        start_time = time.perf_counter()
+        elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)
+
+        return_fut = charm.Future()
+        charm.hapiAddCudaCallback(stream_handle, return_fut)
+        return_fut.get()
+        kernel_done_time = time.perf_counter()
+        print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")
+
+        B_host = B_gpu.copy_to_host(stream=s)
+
+        s.synchronize()
+
+        sum_result = np.sum(B_host)
+        print(f"Sum of result is {sum_result}")
+
+        charm.exit()
+
+    charm.start(main)
+
+
+The above example demonstrates how to use the Charm4py HAPI interface to insert a callback into a CUDA stream and track
+completion of a numba kernel launch.
diff --git a/docs/index.rst b/docs/index.rst
index 3d5e90a7..328b02c4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -41,6 +41,7 @@ to the largest supercomputers.
    sections
    pool
    rules
+   gpus

 .. toctree::
    :maxdepth: 2
diff --git a/examples/cuda/hapi/README.md b/examples/cuda/hapi/README.md
new file mode 100644
index 00000000..f94f7bd2
--- /dev/null
+++ b/examples/cuda/hapi/README.md
@@ -0,0 +1,27 @@
+## Using Charm4py with CUDA
+
+### HAPI CUDA Callback
+
+Example overview
+
+- The example in `hapi-cuda-callback.py` demonstrates usage of addCudaCallback from the Charm++ HAPI library.
+- addCudaCallback provides an asynchronous mechanism to wait for kernel completion via Charm4py futures.
+- The example is built around a simple numba kernel.
+
+Usage
+
+- hapiAddCudaCallback requires a CUDA stream handle and a future.
+- Access to the CUDA stream handle depends on the Python library being used. For example:
+  - using torch: `stream_handle = torch.cuda.Stream().cuda_stream`
+  - using numba: `stream_handle = numba.cuda.stream().handle.value`
+- Currently, hapiAddCudaCallback is restricted to torch and numba CUDA streams; a minimal call-pattern sketch is shown below.
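+
+For illustration only, the call pattern looks roughly like this (a sketch; it assumes numba is installed and that the code runs inside a charm4py entry point such as `main`):
+
+```python
+import numba.cuda as cuda
+from charm4py import charm
+
+# create a CUDA stream and extract its raw handle
+stream = cuda.stream()
+stream_handle = stream.handle.value
+
+# ... launch kernels or async copies on `stream` here ...
+
+# ask HAPI to set the future once the work queued on the stream completes
+done = charm.Future()
+charm.hapiAddCudaCallback(stream_handle, done)
+done.get()  # blocks until the callback fires
+```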
+
+Running the example
+
+- If running locally, use:
+
+`$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py`
+
+- If running on a cluster machine with Slurm, use:
+
+`$ srun -n <N> python3 hapi-cuda-callback.py`
diff --git a/examples/cuda/hapi/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py
new file mode 100644
index 00000000..a7887e52
--- /dev/null
+++ b/examples/cuda/hapi/hapi-cuda-callback.py
@@ -0,0 +1,47 @@
+from charm4py import charm
+import time
+import numba.cuda as cuda
+import numpy as np
+
+@cuda.jit
+def elementwise_sum_kernel(x_in, x_out):
+    idx = cuda.grid(1)
+    if idx < x_in.shape[0]:
+        x_out[idx] = x_in[idx] + x_in[idx]
+
+def main(args):
+    N = 1_000_000
+    array_size = (N,)
+
+    s = cuda.stream()
+    stream_handle = s.handle.value
+
+    A_host = np.arange(N, dtype=np.float32)
+
+    A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+    B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+    A_gpu.copy_to_device(A_host, stream=s)
+
+    threads_per_block = 128
+    blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block
+
+    print("Launching kernel and inserting callback...")
+    start_time = time.perf_counter()
+    elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)
+
+    return_fut = charm.Future()
+    charm.hapiAddCudaCallback(stream_handle, return_fut)
+    return_fut.get()
+    kernel_done_time = time.perf_counter()
+    print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")
+
+    B_host = B_gpu.copy_to_host(stream=s)
+
+    s.synchronize()
+
+    sum_result = np.sum(B_host)
+    print(f"Sum of result is {sum_result}")
+
+    charm.exit()
+
+charm.start(main)
diff --git a/setup.py b/setup.py
index 6b351907..e47d31eb 100644
--- a/setup.py
+++ b/setup.py
@@ -325,7 +325,9 @@ def install(self):
             cobject_extra_args=["-Wl,-rpath,@loader_path/.libs"]
         else:
             cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"]
-
+
+    cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('cuda') != -1
+
     extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython',
                                 sources=['charm4py/charmlib/charmlib_cython.pyx'],
                                 include_dirs=['charm_src/charm/include'] + my_include_dirs,
@@ -333,7 +335,8 @@ def install(self):
                                 libraries=["charm"],
                                 extra_compile_args=[],
                                 extra_link_args=extra_link_args,
-                                ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                                ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                                     'HAVE_CUDA_BUILD': cudaBuild}))

     extensions.extend(cythonize(setuptools.Extension('charm4py.c_object_store',
                                 sources=['charm4py/c_object_store.pyx'],
@@ -342,7 +345,8 @@ def install(self):
                                 libraries=["charm"],
                                 extra_compile_args=[],
                                 extra_link_args=cobject_extra_args,
-                                ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                                ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                                     'HAVE_CUDA_BUILD': cudaBuild}))

 additional_setup_keywords = {}