From c1bef50acf22652311d326a83c846742722834b0 Mon Sep 17 00:00:00 2001 From: mtaylo12 Date: Thu, 3 Oct 2024 14:28:27 -0500 Subject: [PATCH 01/14] ctypes and cython streams working --- charm4py/charmlib/ccharm.pxd | 2 +- charm4py/charmlib/charmlib_ctypes.py | 3 +++ charm4py/charmlib/charmlib_cython.pyx | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 5bbe1b05..e97e4287 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -69,7 +69,7 @@ cdef extern from "charm.h": void CkStartQDExt_ArrayCallback(int aid, int* idx, int ndims, int epIdx, int fid); void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); - + void CkHapiAddCallback(long stream, void *cb, void *msg); cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_ctypes.py b/charm4py/charmlib/charmlib_ctypes.py index 72f8fadf..922cf85e 100644 --- a/charm4py/charmlib/charmlib_ctypes.py +++ b/charm4py/charmlib/charmlib_ctypes.py @@ -729,6 +729,9 @@ def CkGetPesOnPhysicalNode(self, node): def scheduleTagAfter(self, tag, msecs): self.lib.CcdCallFnAfter(self.CcdCallFnAfterCallback_cb, tag, c_double(msecs)) + + def hapiAddCallback(self, stream, callback): + self.lib.CkHapiAddCallback(stream, callback, None) def CcdCallFnAfterCallback(self, userParam, curWallTime): try: diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index e41bc5da..97f292fa 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -826,6 +826,9 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) + def hapiAddCallback(self, stream, callback): + 
CkHapiAddCallback( stream, callback, None) + # first callback from Charm++ shared library cdef void registerMainModule() noexcept: From 728457c202cd2c1ca2e031826fc19a02621e693a Mon Sep 17 00:00:00 2001 From: mtaylo12 Date: Thu, 3 Oct 2024 17:48:46 -0500 Subject: [PATCH 02/14] cython prelim implementation working! --- charm4py/charmlib/ccharm.pxd | 3 ++- charm4py/charmlib/charmlib_cython.pyx | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index e97e4287..e92f3ef3 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -69,7 +69,8 @@ cdef extern from "charm.h": void CkStartQDExt_ArrayCallback(int aid, int* idx, int ndims, int epIdx, int fid); void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); - void CkHapiAddCallback(long stream, void *cb, void *msg); + + void CkHapiAddCallback(long stream, void (*cb)(void*, void*), void *msg); cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 97f292fa..0091336a 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -826,9 +826,12 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def hapiAddCallback(self, stream, callback): - CkHapiAddCallback( stream, callback, None) + def hapiAddCallback(self, stream, fn): + CkHapiAddCallback( stream, testHapiCallback, fn) +cdef void testHapiCallback(void *f, void* message) noexcept: + print("testing hapi callback") + (f)() # first callback from Charm++ shared library cdef void registerMainModule() noexcept: From e2f505c43422cd96f37c6bc40b9abd84916005c0 Mon Sep 17 00:00:00 
2001 From: mtaylo12 Date: Fri, 4 Oct 2024 17:01:52 -0500 Subject: [PATCH 03/14] change to support future --- charm4py/charmlib/ccharm.pxd | 2 +- charm4py/charmlib/charmlib_cython.pyx | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index e92f3ef3..1fd2aaea 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -70,7 +70,7 @@ cdef extern from "charm.h": void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); - void CkHapiAddCallback(long stream, void (*cb)(void*, void*), void *msg); + void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid); cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 0091336a..c009320b 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -826,12 +826,15 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def hapiAddCallback(self, stream, fn): - CkHapiAddCallback( stream, testHapiCallback, fn) - -cdef void testHapiCallback(void *f, void* message) noexcept: - print("testing hapi callback") - (f)() + def hapiAddCallback(self, stream, future): + id = future.fid + print("adding callback with future fid", id) + CkHapiAddCallback( stream, depositFutureWithId, id) + +cdef void depositFutureWithId(void *param, void* message) noexcept: + cdef int futureId = param + print("future fid received, depositing: ", futureId) + charm._future_deposit_result(futureId) # first callback from Charm++ shared library cdef void registerMainModule() noexcept: From fb63125912fe35b5aa6fc1d395059181707a7c0d Mon Sep 17 00:00:00 
2001 From: mtaylo12 Date: Wed, 9 Oct 2024 09:19:44 -0500 Subject: [PATCH 04/14] hapi works via futures now --- charm4py/charm.py | 4 ++++ charm4py/charmlib/charmlib_cython.pyx | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 47a9b1f4..31805ed4 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -934,6 +934,10 @@ def recordSendRecv(self, stats, size): stats[2] = max(size, stats[2]) stats[3] += size stats[4] = size + + # deposit value of one of the futures that was created on this PE + def _future_deposit_result(self, fid, result=None): + self.threadMgr.depositFuture(fid, result) def __printTable__(self, table, sep): col_width = [max(len(x) for x in col) for col in zip(*table)] diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index c009320b..07a608d9 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -826,15 +826,13 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def hapiAddCallback(self, stream, future): + def hapiAddCudaCallback(self, stream, future): id = future.fid - print("adding callback with future fid", id) CkHapiAddCallback( stream, depositFutureWithId, id) cdef void depositFutureWithId(void *param, void* message) noexcept: cdef int futureId = param - print("future fid received, depositing: ", futureId) - charm._future_deposit_result(futureId) + charm._future_deposit_result(futureId, 1) # first callback from Charm++ shared library cdef void registerMainModule() noexcept: From d031d3288d1ababfa22991044cf1964276d54b77 Mon Sep 17 00:00:00 2001 From: mtaylo12 Date: Wed, 30 Oct 2024 10:19:52 -0500 Subject: [PATCH 05/14] cleanup and example --- charm4py/charmlib/charmlib_cython.pyx | 2 +- examples/cuda/README.md | 17 +++++++++++++ examples/cuda/hapi-cuda-callback.py | 36 +++++++++++++++++++++++++++ 3 files changed, 54 
insertions(+), 1 deletion(-) create mode 100644 examples/cuda/README.md create mode 100644 examples/cuda/hapi-cuda-callback.py diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 07a608d9..6cd9c14c 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -832,7 +832,7 @@ class CharmLib(object): cdef void depositFutureWithId(void *param, void* message) noexcept: cdef int futureId = param - charm._future_deposit_result(futureId, 1) + charm._future_deposit_result(futureId, None) # first callback from Charm++ shared library cdef void registerMainModule() noexcept: diff --git a/examples/cuda/README.md b/examples/cuda/README.md new file mode 100644 index 00000000..b13dca66 --- /dev/null +++ b/examples/cuda/README.md @@ -0,0 +1,17 @@ +## Using Charm4py with CUDA + +### HAPI CUDA Callback + +Example overview + +- The example in `hapi-cuda-callback.py` demonstrates usage of addCudaCallback from the Charm++ HAPI library +- addCudaCallback enables an asynchronous mechanism to wait for kernel completion via Charm4py futures +- The example is based around a simple torch kernel. + +Usage + +- hapiAddCudaCallback requires a cuda stream handle and a future +- access to the Cuda stream handle depends on the Python library being used. For example... + - using torch: `stream_handle = torch.cuda.Stream().cuda_stream` + - using numba: `stream_handle = numba.cuda.stream().handle.value` +- currently, the hapiAddCudaCallback is restricted to torch and numba based Cuda streams. 
diff --git a/examples/cuda/hapi-cuda-callback.py b/examples/cuda/hapi-cuda-callback.py new file mode 100644 index 00000000..0deb87d8 --- /dev/null +++ b/examples/cuda/hapi-cuda-callback.py @@ -0,0 +1,36 @@ +from charm4py import charm +import time +import torch + +# using numba requires the following stream handle +# import numba.cuda as cuda +# s = cuda.stream() +# stream_handle = s.handle.value + +def main(args): + cuda = torch.device('cuda') + s = torch.cuda.Stream() # Create a new stream. + A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0) + + print("Starting computation and inserting callback") + start_time = time.perf_counter() + with torch.cuda.stream(s): + B = torch.sum(A) + + # create future to track cuda stream + return_fut = charm.Future() + stream_handle = s.cuda_stream + charm.lib.hapiAddCudaCallback(stream_handle, return_fut) + + # other work can be overlapped with kernel here + + return_fut.get() + + sum = B.cpu().item() + elapsed = time.perf_counter() - start_time + print(f"Kernel done in {elapsed} seconds. 
Sum result is {sum}") + + charm.exit() + + +charm.start(main) \ No newline at end of file From 39e09ba9963179f4b451d143ad598647e998744a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 25 Apr 2025 11:32:11 -0500 Subject: [PATCH 06/14] make separate directory --- examples/cuda/{ => hapi}/README.md | 0 examples/cuda/{ => hapi}/hapi-cuda-callback.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename examples/cuda/{ => hapi}/README.md (100%) rename examples/cuda/{ => hapi}/hapi-cuda-callback.py (100%) diff --git a/examples/cuda/README.md b/examples/cuda/hapi/README.md similarity index 100% rename from examples/cuda/README.md rename to examples/cuda/hapi/README.md diff --git a/examples/cuda/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py similarity index 100% rename from examples/cuda/hapi-cuda-callback.py rename to examples/cuda/hapi/hapi-cuda-callback.py From e489a75dcf62006ffcb6041b8271f8872cf84652 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 25 Apr 2025 12:32:21 -0500 Subject: [PATCH 07/14] Make example numba --- examples/cuda/hapi/hapi-cuda-callback.py | 67 ++++++++++++++---------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/examples/cuda/hapi/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py index 0deb87d8..1e2801ce 100644 --- a/examples/cuda/hapi/hapi-cuda-callback.py +++ b/examples/cuda/hapi/hapi-cuda-callback.py @@ -1,36 +1,47 @@ from charm4py import charm import time -import torch - -# using numba requires the following stream handle -# import numba.cuda as cuda -# s = cuda.stream() -# stream_handle = s.handle.value - -def main(args): - cuda = torch.device('cuda') - s = torch.cuda.Stream() # Create a new stream. 
- A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0) - - print("Starting computation and inserting callback") +import numba.cuda as cuda +import numpy as np + +@cuda.jit +def elementwise_sum_kernel(x_in, x_out): + idx = cuda.grid(1) + if idx < x_in.shape[0]: + x_out[idx] = x_in[idx] + x_in[idx] + +def main(args): + N = 1_000_000 + array_size = (N,) + + s = cuda.stream() + stream_handle = s.handle.value + + A_host = np.arange(N, dtype=np.float32) + + A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s) + B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s) + A_gpu.copy_to_device(A_host, stream=s) + + threads_per_block = 128 + blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block + + print("Launching kernel and inserting callback...") start_time = time.perf_counter() - with torch.cuda.stream(s): - B = torch.sum(A) - - # create future to track cuda stream + elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu) + return_fut = charm.Future() - stream_handle = s.cuda_stream - charm.lib.hapiAddCudaCallback(stream_handle, return_fut) - - # other work can be overlapped with kernel here - + charm.hapiAddCudaCallback(stream_handle, return_fut) return_fut.get() - - sum = B.cpu().item() - elapsed = time.perf_counter() - start_time - print(f"Kernel done in {elapsed} seconds. 
Sum result is {sum}") - + kernel_done_time = time.perf_counter() + print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.") + + B_host = B_gpu.copy_to_host(stream=s) + + s.synchronize() + + sum_result = np.sum(B_host) + print(f"Sum of result is {sum_result}") + charm.exit() - charm.start(main) \ No newline at end of file From 2053aa8126d72f4b29a78100e8711b8e60b27c44 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 25 Apr 2025 12:32:31 -0500 Subject: [PATCH 08/14] Set hapiAddCudaCallback to charm --- charm4py/charm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/charm4py/charm.py b/charm4py/charm.py index 31805ed4..c6b0bf74 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -127,6 +127,7 @@ def __init__(self): self.CkChareSend = self.lib.CkChareSend self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend + self.hapiAddCudaCallback = self.lib.hapiAddCudaCallback self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False From 46afa36b96188c88251ce4ff66f2482ca504d36e Mon Sep 17 00:00:00 2001 From: Maya Taylor Date: Tue, 29 Apr 2025 15:11:03 -0500 Subject: [PATCH 09/14] flag for cuda build --- charm4py/charmlib/charmlib_cython.pyx | 2 ++ setup.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 9f9645f8..56b91390 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -868,6 +868,8 @@ class CharmLib(object): CcsSendReply(replyLen, replyData) def hapiAddCudaCallback(self, stream, future): + if not HAVE_CUDA_BUILD: + raise Charm4PyError("HAPI usage not allowed: Charm++ was not built with CUDA support") id = future.fid CkHapiAddCallback( stream, depositFutureWithId, id) diff --git a/setup.py b/setup.py index 6b351907..e53bafd9 100644 --- a/setup.py +++ b/setup.py 
@@ -325,6 +325,10 @@ def install(self): cobject_extra_args=["-Wl,-rpath,@loader_path/.libs"] else: cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"] + + cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('CUDA') != -1 + if (cudaBuild): + print("CUDA build detected") extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython', sources=['charm4py/charmlib/charmlib_cython.pyx'], @@ -333,7 +337,8 @@ def install(self): libraries=["charm"], extra_compile_args=[], extra_link_args=extra_link_args, - ), compile_time_env={'HAVE_NUMPY': haveNumpy})) + ), compile_time_env={'HAVE_NUMPY': haveNumpy, + 'HAVE_CUDA_BUILD': cudaBuild})) extensions.extend(cythonize(setuptools.Extension('charm4py.c_object_store', sources=['charm4py/c_object_store.pyx'], @@ -342,7 +347,8 @@ def install(self): libraries=["charm"], extra_compile_args=[], extra_link_args=cobject_extra_args, - ), compile_time_env={'HAVE_NUMPY': haveNumpy})) + ), compile_time_env={'HAVE_NUMPY': haveNumpy, + 'HAVE_CUDA_BUILD': cudaBuild})) additional_setup_keywords = {} From 24f908e36a9cc6c430c4914a11ae7799306df8a9 Mon Sep 17 00:00:00 2001 From: Maya Taylor Date: Tue, 29 Apr 2025 15:24:01 -0500 Subject: [PATCH 10/14] add hapi docs --- docs/gpus.rst | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 80 insertions(+) create mode 100644 docs/gpus.rst diff --git a/docs/gpus.rst b/docs/gpus.rst new file mode 100644 index 00000000..a41528af --- /dev/null +++ b/docs/gpus.rst @@ -0,0 +1,79 @@ +==== +GPUs +==== + +.. .. contents:: + + +GPUs are supported in Charm4py via the Charm++ HAPI (Hybrid API) interface. +Presently, this support allows asynchronous completion detection of GPU kernels via Charm4py futures, +using the function ``charm.hapiAddCudaCallback``. + +The HAPI Charm4py API is: + +.. code-block:: python + + def hapiAddCudaCallback(stream, future) + +.. 
note:: + + For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback + into the stream such that when the callback is reached, the corresponding Charm4py future is set. + + +Examples +-------- + +.. code-block:: python + + from charm4py import charm + import time + import numba.cuda as cuda + import numpy as np + + @cuda.jit + def elementwise_sum_kernel(x_in, x_out): + idx = cuda.grid(1) + if idx < x_in.shape[0]: + x_out[idx] = x_in[idx] + x_in[idx] + + def main(args): + N = 1_000_000 + array_size = (N,) + + s = cuda.stream() + stream_handle = s.handle.value + + A_host = np.arange(N, dtype=np.float32) + + A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s) + B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s) + A_gpu.copy_to_device(A_host, stream=s) + + threads_per_block = 128 + blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block + + print("Launching kernel and inserting callback...") + start_time = time.perf_counter() + elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu) + + return_fut = charm.Future() + charm.hapiAddCudaCallback(stream_handle, return_fut) + return_fut.get() + kernel_done_time = time.perf_counter() + print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.") + + B_host = B_gpu.copy_to_host(stream=s) + + s.synchronize() + + sum_result = np.sum(B_host) + print(f"Sum of result is {sum_result}") + + charm.exit() + + charm.start(main) + + +The above example demonstrates how to use the Charm4py HAPI interface to insert a callback into a CUDA stream and track +completion of a numba kernel launch. diff --git a/docs/index.rst b/docs/index.rst index 3d5e90a7..328b02c4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -41,6 +41,7 @@ to the largest supercomputers. sections pool rules + gpus .. 
toctree:: :maxdepth: 2 From 7af8580a0ee15ae90884b8a0bf32a937a97dbd82 Mon Sep 17 00:00:00 2001 From: Maya Taylor <70495835+mayantaylor@users.noreply.github.com> Date: Wed, 30 Apr 2025 07:47:56 -0500 Subject: [PATCH 11/14] Update docs with build instructions --- docs/gpus.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/gpus.rst b/docs/gpus.rst index a41528af..0cb1ebde 100644 --- a/docs/gpus.rst +++ b/docs/gpus.rst @@ -20,6 +20,18 @@ The HAPI Charm4py API is: For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback into the stream such that when the callback is reached, the corresponding Charm4py future is set. +Enabling HAPI +------------- +To build Charm4py with HAPI support, add "cuda" to the Charm build options and follow the steps to build Charm4py from source: + +.. code-block:: shell + + export CHARM_EXTRA_BUILD_OPTS="cuda" + pip install . + +.. warning:: + + To ensure that the underlying Charm build has Cuda enabled, remove any pre-existing builds in charm_src/charm before setting the Cuda option and running install. 
Examples -------- From 90f8bbac779d57c2629ed3a697220f7a889b8308 Mon Sep 17 00:00:00 2001 From: mtaylo12 Date: Wed, 30 Apr 2025 07:53:41 -0500 Subject: [PATCH 12/14] fix flag --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index e53bafd9..e47d31eb 100644 --- a/setup.py +++ b/setup.py @@ -326,10 +326,8 @@ def install(self): else: cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"] - cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('CUDA') != -1 - if (cudaBuild): - print("CUDA build detected") - + cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('cuda') != -1 + extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython', sources=['charm4py/charmlib/charmlib_cython.pyx'], include_dirs=['charm_src/charm/include'] + my_include_dirs, From 2fa6aa30d35a554662913fd42e6da2100231f7f8 Mon Sep 17 00:00:00 2001 From: Ritvik Rao Date: Wed, 30 Apr 2025 08:20:50 -0500 Subject: [PATCH 13/14] add example command to readme --- examples/cuda/hapi/README.md | 10 ++++++++++ examples/cuda/hapi/hapi-cuda-callback.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/cuda/hapi/README.md b/examples/cuda/hapi/README.md index b13dca66..5a249b31 100644 --- a/examples/cuda/hapi/README.md +++ b/examples/cuda/hapi/README.md @@ -15,3 +15,13 @@ Usage - using torch: `stream_handle = torch.cuda.Stream().cuda_stream` - using numba: `stream_handle = numba.cuda.stream().handle.value` - currently, the hapiAddCudaCallback is restricted to torch and numba based Cuda streams. 
+ +Running example + +- If running locally, use: + +$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py + +- If running on a cluster machine with Slurm, use: + +$ srun -n <N> python3 hapi-cuda-callback.py diff --git a/examples/cuda/hapi/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py index 1e2801ce..a7887e52 100644 --- a/examples/cuda/hapi/hapi-cuda-callback.py +++ b/examples/cuda/hapi/hapi-cuda-callback.py @@ -44,4 +44,4 @@ def main(args): charm.exit() -charm.start(main) \ No newline at end of file +charm.start(main) From 1bca9da5c6ab79244e6d8de9bf6ffb4cdbe31fe6 Mon Sep 17 00:00:00 2001 From: Ritvik Rao Date: Wed, 30 Apr 2025 08:22:07 -0500 Subject: [PATCH 14/14] fix readme --- examples/cuda/hapi/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cuda/hapi/README.md b/examples/cuda/hapi/README.md index 5a249b31..f94f7bd2 100644 --- a/examples/cuda/hapi/README.md +++ b/examples/cuda/hapi/README.md @@ -20,8 +20,8 @@ Running example - If running locally, use: -$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py +`$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py` - If running on a cluster machine with Slurm, use: -$ srun -n <N> python3 hapi-cuda-callback.py +`$ srun -n <N> python3 hapi-cuda-callback.py`