From c1bef50acf22652311d326a83c846742722834b0 Mon Sep 17 00:00:00 2001
From: mtaylo12 <mayat4@illinois.edu>
Date: Thu, 3 Oct 2024 14:28:27 -0500
Subject: [PATCH 01/14] ctypes and cython streams working

---
 charm4py/charmlib/ccharm.pxd          | 2 +-
 charm4py/charmlib/charmlib_ctypes.py  | 3 +++
 charm4py/charmlib/charmlib_cython.pyx | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd
index 5bbe1b05..e97e4287 100644
--- a/charm4py/charmlib/ccharm.pxd
+++ b/charm4py/charmlib/ccharm.pxd
@@ -69,7 +69,7 @@ cdef extern from "charm.h":
     void CkStartQDExt_ArrayCallback(int aid, int* idx, int ndims, int epIdx, int fid);
     void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
     void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs);
-
+    void CkHapiAddCallback(long stream, void *cb, void *msg);
 
 cdef extern from "spanningTree.h":
     void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor,
diff --git a/charm4py/charmlib/charmlib_ctypes.py b/charm4py/charmlib/charmlib_ctypes.py
index 72f8fadf..922cf85e 100644
--- a/charm4py/charmlib/charmlib_ctypes.py
+++ b/charm4py/charmlib/charmlib_ctypes.py
@@ -729,6 +729,9 @@ def CkGetPesOnPhysicalNode(self, node):
 
   def scheduleTagAfter(self, tag, msecs):
     self.lib.CcdCallFnAfter(self.CcdCallFnAfterCallback_cb, tag, c_double(msecs))
+    
+  def hapiAddCallback(self, stream, callback):
+    self.lib.CkHapiAddCallback(stream, callback, None)
 
   def CcdCallFnAfterCallback(self, userParam, curWallTime):
     try:
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index e41bc5da..97f292fa 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -826,6 +826,9 @@ class CharmLib(object):
   def scheduleTagAfter(self, int tag, double msecs):
     CcdCallFnAfter(CcdCallFnAfterCallback, <void*>tag, msecs)
 
+  def hapiAddCallback(self, stream, callback):
+    CkHapiAddCallback(<long> stream, <void*> callback, <void*> None)
+
 
 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:

From 728457c202cd2c1ca2e031826fc19a02621e693a Mon Sep 17 00:00:00 2001
From: mtaylo12 <mayat4@illinois.edu>
Date: Thu, 3 Oct 2024 17:48:46 -0500
Subject: [PATCH 02/14] cython prelim implementation working!

---
 charm4py/charmlib/ccharm.pxd          | 3 ++-
 charm4py/charmlib/charmlib_cython.pyx | 7 +++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd
index e97e4287..e92f3ef3 100644
--- a/charm4py/charmlib/ccharm.pxd
+++ b/charm4py/charmlib/ccharm.pxd
@@ -69,7 +69,8 @@ cdef extern from "charm.h":
     void CkStartQDExt_ArrayCallback(int aid, int* idx, int ndims, int epIdx, int fid);
     void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
     void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs);
-    void CkHapiAddCallback(long stream, void *cb, void *msg);
+
+    void CkHapiAddCallback(long stream, void (*cb)(void*, void*), void *msg);
 
 cdef extern from "spanningTree.h":
     void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor,
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index 97f292fa..0091336a 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -826,9 +826,12 @@ class CharmLib(object):
   def scheduleTagAfter(self, int tag, double msecs):
     CcdCallFnAfter(CcdCallFnAfterCallback, <void*>tag, msecs)
 
-  def hapiAddCallback(self, stream, callback):
-    CkHapiAddCallback(<long> stream, <void*> callback, <void*> None)
+  def hapiAddCallback(self, stream, fn):
+    CkHapiAddCallback(<long> stream, testHapiCallback, <void*> fn)
 
+cdef void testHapiCallback(void *f, void* message) noexcept:
+  print("testing hapi callback")
+  (<object>f)()
 
 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:

From e2f505c43422cd96f37c6bc40b9abd84916005c0 Mon Sep 17 00:00:00 2001
From: mtaylo12 <mayat4@illinois.edu>
Date: Fri, 4 Oct 2024 17:01:52 -0500
Subject: [PATCH 03/14] change to support future

---
 charm4py/charmlib/ccharm.pxd          |  2 +-
 charm4py/charmlib/charmlib_cython.pyx | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd
index e92f3ef3..1fd2aaea 100644
--- a/charm4py/charmlib/ccharm.pxd
+++ b/charm4py/charmlib/ccharm.pxd
@@ -70,7 +70,7 @@ cdef extern from "charm.h":
     void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
     void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs);
 
-    void CkHapiAddCallback(long stream, void (*cb)(void*, void*), void *msg);
+    void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid);
 
 cdef extern from "spanningTree.h":
     void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor,
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index 0091336a..c009320b 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -826,12 +826,15 @@ class CharmLib(object):
   def scheduleTagAfter(self, int tag, double msecs):
     CcdCallFnAfter(CcdCallFnAfterCallback, <void*>tag, msecs)
 
-  def hapiAddCallback(self, stream, fn):
-    CkHapiAddCallback(<long> stream, testHapiCallback, <void*> fn)
-
-cdef void testHapiCallback(void *f, void* message) noexcept:
-  print("testing hapi callback")
-  (<object>f)()
+  def hapiAddCallback(self, stream, future):
+    id = future.fid
+    print("adding callback with future fid", id)
+    CkHapiAddCallback(<long> stream, depositFutureWithId, <int> id)
+
+cdef void depositFutureWithId(void *param, void* message) noexcept:
+  cdef int futureId = <int> param
+  print("future fid received, depositing: ", futureId)
+  charm._future_deposit_result(futureId)
 
 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:

From fb63125912fe35b5aa6fc1d395059181707a7c0d Mon Sep 17 00:00:00 2001
From: mtaylo12 <mayat4@illinois.edu>
Date: Wed, 9 Oct 2024 09:19:44 -0500
Subject: [PATCH 04/14] hapi works via futures now

---
 charm4py/charm.py                     | 4 ++++
 charm4py/charmlib/charmlib_cython.pyx | 6 ++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/charm4py/charm.py b/charm4py/charm.py
index 47a9b1f4..31805ed4 100644
--- a/charm4py/charm.py
+++ b/charm4py/charm.py
@@ -934,6 +934,10 @@ def recordSendRecv(self, stats, size):
         stats[2] = max(size, stats[2])
         stats[3] += size
         stats[4] = size
+        
+    # deposit value of one of the futures that was created on this PE
+    def _future_deposit_result(self, fid, result=None):
+        self.threadMgr.depositFuture(fid, result)
 
     def __printTable__(self, table, sep):
         col_width = [max(len(x) for x in col) for col in zip(*table)]
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index c009320b..07a608d9 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -826,15 +826,13 @@ class CharmLib(object):
   def scheduleTagAfter(self, int tag, double msecs):
     CcdCallFnAfter(CcdCallFnAfterCallback, <void*>tag, msecs)
 
-  def hapiAddCallback(self, stream, future):
+  def hapiAddCudaCallback(self, stream, future):
     id = future.fid
-    print("adding callback with future fid", id)
     CkHapiAddCallback(<long> stream, depositFutureWithId, <int> id)
 
 cdef void depositFutureWithId(void *param, void* message) noexcept:
   cdef int futureId = <int> param
-  print("future fid received, depositing: ", futureId)
-  charm._future_deposit_result(futureId)
+  charm._future_deposit_result(futureId, 1)
 
 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:

From d031d3288d1ababfa22991044cf1964276d54b77 Mon Sep 17 00:00:00 2001
From: mtaylo12 <mayat4@illinois.edu>
Date: Wed, 30 Oct 2024 10:19:52 -0500
Subject: [PATCH 05/14] cleanup and example

---
 charm4py/charmlib/charmlib_cython.pyx |  2 +-
 examples/cuda/README.md               | 17 +++++++++++++
 examples/cuda/hapi-cuda-callback.py   | 36 +++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 examples/cuda/README.md
 create mode 100644 examples/cuda/hapi-cuda-callback.py

diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index 07a608d9..6cd9c14c 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -832,7 +832,7 @@ class CharmLib(object):
 
 cdef void depositFutureWithId(void *param, void* message) noexcept:
   cdef int futureId = <int> param
-  charm._future_deposit_result(futureId, 1)
+  charm._future_deposit_result(futureId, None)
 
 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:
diff --git a/examples/cuda/README.md b/examples/cuda/README.md
new file mode 100644
index 00000000..b13dca66
--- /dev/null
+++ b/examples/cuda/README.md
@@ -0,0 +1,17 @@
+## Using Charm4py with CUDA
+
+### HAPI CUDA Callback
+
+Example overview
+
+- The example in `hapi-cuda-callback.py` demonstrates usage of addCudaCallback from the Charm++ HAPI library
+- addCudaCallback enables an asynchronous mechanism to wait for kernel completion via Charm4py futures
+- The example is based around a simple torch kernel.
+
+Usage
+
+- hapiAddCudaCallback requires a CUDA stream handle and a future
+- access to the CUDA stream handle depends on the Python library being used. For example...
+  - using torch: `stream_handle = torch.cuda.Stream().cuda_stream`
+  - using numba: `stream_handle = numba.cuda.stream().handle.value`
+- currently, the hapiAddCudaCallback is restricted to torch and numba based Cuda streams.
diff --git a/examples/cuda/hapi-cuda-callback.py b/examples/cuda/hapi-cuda-callback.py
new file mode 100644
index 00000000..0deb87d8
--- /dev/null
+++ b/examples/cuda/hapi-cuda-callback.py
@@ -0,0 +1,36 @@
+from charm4py import charm
+import time
+import torch
+
+# using numba requires the following stream handle
+# import numba.cuda as cuda
+# s = cuda.stream()
+# stream_handle = s.handle.value
+
+def main(args):        
+    cuda = torch.device('cuda')
+    s = torch.cuda.Stream()  # Create a new stream.
+    A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0)
+   
+    print("Starting computation and inserting callback")
+    start_time = time.perf_counter()
+    with torch.cuda.stream(s):
+        B = torch.sum(A)
+    
+    # create future to track cuda stream
+    return_fut = charm.Future()
+    stream_handle = s.cuda_stream
+    charm.lib.hapiAddCudaCallback(stream_handle, return_fut) 
+    
+    # other work can be overlapped with kernel here
+    
+    return_fut.get()
+    
+    sum = B.cpu().item()
+    elapsed = time.perf_counter() - start_time
+    print(f"Kernel done in {elapsed} seconds. Sum result is {sum}")
+    
+    charm.exit()
+    
+
+charm.start(main)
\ No newline at end of file

From 39e09ba9963179f4b451d143ad598647e998744a Mon Sep 17 00:00:00 2001
From: Zane Fink <finkzane@gmail.com>
Date: Fri, 25 Apr 2025 11:32:11 -0500
Subject: [PATCH 06/14] make separate directory

---
 examples/cuda/{ => hapi}/README.md             | 0
 examples/cuda/{ => hapi}/hapi-cuda-callback.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/cuda/{ => hapi}/README.md (100%)
 rename examples/cuda/{ => hapi}/hapi-cuda-callback.py (100%)

diff --git a/examples/cuda/README.md b/examples/cuda/hapi/README.md
similarity index 100%
rename from examples/cuda/README.md
rename to examples/cuda/hapi/README.md
diff --git a/examples/cuda/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py
similarity index 100%
rename from examples/cuda/hapi-cuda-callback.py
rename to examples/cuda/hapi/hapi-cuda-callback.py

From e489a75dcf62006ffcb6041b8271f8872cf84652 Mon Sep 17 00:00:00 2001
From: Zane Fink <finkzane@gmail.com>
Date: Fri, 25 Apr 2025 12:32:21 -0500
Subject: [PATCH 07/14] Make example numba

---
 examples/cuda/hapi/hapi-cuda-callback.py | 67 ++++++++++++++----------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/examples/cuda/hapi/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py
index 0deb87d8..1e2801ce 100644
--- a/examples/cuda/hapi/hapi-cuda-callback.py
+++ b/examples/cuda/hapi/hapi-cuda-callback.py
@@ -1,36 +1,47 @@
 from charm4py import charm
 import time
-import torch
-
-# using numba requires the following stream handle
-# import numba.cuda as cuda
-# s = cuda.stream()
-# stream_handle = s.handle.value
-
-def main(args):        
-    cuda = torch.device('cuda')
-    s = torch.cuda.Stream()  # Create a new stream.
-    A = torch.empty((100, 100), device=cuda).normal_(0.0, 1.0)
-   
-    print("Starting computation and inserting callback")
+import numba.cuda as cuda
+import numpy as np
+
+@cuda.jit
+def elementwise_sum_kernel(x_in, x_out):
+    idx = cuda.grid(1)
+    if idx < x_in.shape[0]:
+        x_out[idx] = x_in[idx] + x_in[idx]
+
+def main(args):
+    N = 1_000_000
+    array_size = (N,)
+
+    s = cuda.stream()
+    stream_handle = s.handle.value
+
+    A_host = np.arange(N, dtype=np.float32)
+
+    A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+    B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+    A_gpu.copy_to_device(A_host, stream=s)
+
+    threads_per_block = 128
+    blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block
+
+    print("Launching kernel and inserting callback...")
     start_time = time.perf_counter()
-    with torch.cuda.stream(s):
-        B = torch.sum(A)
-    
-    # create future to track cuda stream
+    elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)
+
     return_fut = charm.Future()
-    stream_handle = s.cuda_stream
-    charm.lib.hapiAddCudaCallback(stream_handle, return_fut) 
-    
-    # other work can be overlapped with kernel here
-    
+    charm.hapiAddCudaCallback(stream_handle, return_fut)
     return_fut.get()
-    
-    sum = B.cpu().item()
-    elapsed = time.perf_counter() - start_time
-    print(f"Kernel done in {elapsed} seconds. Sum result is {sum}")
-    
+    kernel_done_time = time.perf_counter()
+    print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")
+
+    B_host = B_gpu.copy_to_host(stream=s)
+
+    s.synchronize()
+
+    sum_result = np.sum(B_host)
+    print(f"Sum of result is {sum_result}")
+
     charm.exit()
-    
 
 charm.start(main)
\ No newline at end of file

From 2053aa8126d72f4b29a78100e8711b8e60b27c44 Mon Sep 17 00:00:00 2001
From: Zane Fink <finkzane@gmail.com>
Date: Fri, 25 Apr 2025 12:32:31 -0500
Subject: [PATCH 08/14] Set hapiAddCudaCallback to charm

---
 charm4py/charm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/charm4py/charm.py b/charm4py/charm.py
index 31805ed4..c6b0bf74 100644
--- a/charm4py/charm.py
+++ b/charm4py/charm.py
@@ -127,6 +127,7 @@ def __init__(self):
         self.CkChareSend = self.lib.CkChareSend
         self.CkGroupSend = self.lib.CkGroupSend
         self.CkArraySend = self.lib.CkArraySend
+        self.hapiAddCudaCallback = getattr(self.lib, 'hapiAddCudaCallback', None)  # None when the loaded lib layer lacks it (the ctypes layer only defines hapiAddCallback), instead of AttributeError at init
         self.reducers = reduction.ReducerContainer(self)
         self.redMgr = reduction.ReductionManager(self, self.reducers)
         self.mainchareRegistered = False

From 46afa36b96188c88251ce4ff66f2482ca504d36e Mon Sep 17 00:00:00 2001
From: Maya Taylor <mayat4@illinois.edu>
Date: Tue, 29 Apr 2025 15:11:03 -0500
Subject: [PATCH 09/14] flag for cuda build

---
 charm4py/charmlib/charmlib_cython.pyx |  2 ++
 setup.py                              | 10 ++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx
index 9f9645f8..56b91390 100644
--- a/charm4py/charmlib/charmlib_cython.pyx
+++ b/charm4py/charmlib/charmlib_cython.pyx
@@ -868,6 +868,8 @@ class CharmLib(object):
     CcsSendReply(replyLen, <const void*>replyData)
 
   def hapiAddCudaCallback(self, stream, future):
+    if not HAVE_CUDA_BUILD:
+      raise Charm4PyError("HAPI usage not allowed: Charm++ was not built with CUDA support")
     id = future.fid
     CkHapiAddCallback(<long> stream, depositFutureWithId, <int> id)
 
diff --git a/setup.py b/setup.py
index 6b351907..e53bafd9 100644
--- a/setup.py
+++ b/setup.py
@@ -325,6 +325,10 @@ def install(self):
             cobject_extra_args=["-Wl,-rpath,@loader_path/.libs"]
         else:
             cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"]
+            
+    cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('CUDA') != -1
+    if (cudaBuild):
+        print("CUDA build detected")
 
     extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython',
                             sources=['charm4py/charmlib/charmlib_cython.pyx'],
@@ -333,7 +337,8 @@ def install(self):
                             libraries=["charm"],
                             extra_compile_args=[],
                             extra_link_args=extra_link_args,
-                            ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                            ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                                 'HAVE_CUDA_BUILD': cudaBuild}))
 
     extensions.extend(cythonize(setuptools.Extension('charm4py.c_object_store',
                             sources=['charm4py/c_object_store.pyx'],
@@ -342,7 +347,8 @@ def install(self):
                             libraries=["charm"],
                             extra_compile_args=[],
                             extra_link_args=cobject_extra_args,
-                            ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                            ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                                 'HAVE_CUDA_BUILD': cudaBuild}))
 
 
 additional_setup_keywords = {}

From 24f908e36a9cc6c430c4914a11ae7799306df8a9 Mon Sep 17 00:00:00 2001
From: Maya Taylor <mayat4@illinois.edu>
Date: Tue, 29 Apr 2025 15:24:01 -0500
Subject: [PATCH 10/14] add hapi docs

---
 docs/gpus.rst  | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
 docs/index.rst |  1 +
 2 files changed, 80 insertions(+)
 create mode 100644 docs/gpus.rst

diff --git a/docs/gpus.rst b/docs/gpus.rst
new file mode 100644
index 00000000..a41528af
--- /dev/null
+++ b/docs/gpus.rst
@@ -0,0 +1,79 @@
+====
+GPUs
+====
+
+.. .. contents::
+
+
+GPUs are supported in Charm4py via the Charm++ HAPI (Hybrid API) interface.
+Presently, this support allows asynchronous completion detection of GPU kernels via Charm4py futures, 
+using the function ``charm.hapiAddCudaCallback``.
+
+The HAPI Charm4py API is:
+
+.. code-block:: python
+
+    def hapiAddCudaCallback(stream, future)
+
+.. note::
+
+    For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback 
+    into the stream such that when the callback is reached, the corresponding Charm4py future is set.
+
+
+Examples
+--------
+
+.. code-block:: python
+
+    from charm4py import charm
+    import time
+    import numba.cuda as cuda
+    import numpy as np
+
+    @cuda.jit
+    def elementwise_sum_kernel(x_in, x_out):
+        idx = cuda.grid(1)
+        if idx < x_in.shape[0]:
+            x_out[idx] = x_in[idx] + x_in[idx]
+
+    def main(args):
+        N = 1_000_000
+        array_size = (N,)
+
+        s = cuda.stream()
+        stream_handle = s.handle.value
+
+        A_host = np.arange(N, dtype=np.float32)
+
+        A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+        B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
+        A_gpu.copy_to_device(A_host, stream=s)
+
+        threads_per_block = 128
+        blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block
+
+        print("Launching kernel and inserting callback...")
+        start_time = time.perf_counter()
+        elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)
+
+        return_fut = charm.Future()
+        charm.hapiAddCudaCallback(stream_handle, return_fut)
+        return_fut.get()
+        kernel_done_time = time.perf_counter()
+        print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")
+
+        B_host = B_gpu.copy_to_host(stream=s)
+
+        s.synchronize()
+
+        sum_result = np.sum(B_host)
+        print(f"Sum of result is {sum_result}")
+
+        charm.exit()
+
+    charm.start(main)
+
+
+The above example demonstrates how to use the Charm4py HAPI interface to insert a callback into a CUDA stream and track 
+completion of a numba kernel launch.
diff --git a/docs/index.rst b/docs/index.rst
index 3d5e90a7..328b02c4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -41,6 +41,7 @@ to the largest supercomputers.
    sections
    pool
    rules
+   gpus
 
 .. toctree::
    :maxdepth: 2

From 7af8580a0ee15ae90884b8a0bf32a937a97dbd82 Mon Sep 17 00:00:00 2001
From: Maya Taylor <70495835+mayantaylor@users.noreply.github.com>
Date: Wed, 30 Apr 2025 07:47:56 -0500
Subject: [PATCH 11/14] Update docs with build instructions

---
 docs/gpus.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/gpus.rst b/docs/gpus.rst
index a41528af..0cb1ebde 100644
--- a/docs/gpus.rst
+++ b/docs/gpus.rst
@@ -20,6 +20,18 @@ The HAPI Charm4py API is:
     For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback 
     into the stream such that when the callback is reached, the corresponding Charm4py future is set.
 
+Enabling HAPI
+-------------
+To build Charm4py with HAPI support, add "cuda" to the Charm build options and follow the steps to build Charm4py from source:
+
+.. code-block:: shell
+
+   export CHARM_EXTRA_BUILD_OPTS="cuda"
+   pip install .
+
+.. warning:: 
+
+    To ensure that the underlying Charm++ build has CUDA enabled, remove any pre-existing builds in charm_src/charm before setting the CUDA option and running the install.
 
 Examples
 --------

From 90f8bbac779d57c2629ed3a697220f7a889b8308 Mon Sep 17 00:00:00 2001
From: mtaylo12 <mayat4@illinois.edu>
Date: Wed, 30 Apr 2025 07:53:41 -0500
Subject: [PATCH 12/14] fix flag

---
 setup.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index e53bafd9..e47d31eb 100644
--- a/setup.py
+++ b/setup.py
@@ -326,10 +326,8 @@ def install(self):
         else:
             cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"]
             
-    cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('CUDA') != -1
-    if (cudaBuild):
-        print("CUDA build detected")
-
+    cudaBuild = 'cuda' in os.environ.get('CHARM_EXTRA_BUILD_OPTS', '')  # case-sensitive: matches the lowercase Charm++ build option
+    
     extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython',
                             sources=['charm4py/charmlib/charmlib_cython.pyx'],
                             include_dirs=['charm_src/charm/include'] + my_include_dirs,

From 2fa6aa30d35a554662913fd42e6da2100231f7f8 Mon Sep 17 00:00:00 2001
From: Ritvik Rao <rsrao2@illinois.edu>
Date: Wed, 30 Apr 2025 08:20:50 -0500
Subject: [PATCH 13/14] add example command to readme

---
 examples/cuda/hapi/README.md             | 10 ++++++++++
 examples/cuda/hapi/hapi-cuda-callback.py |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/examples/cuda/hapi/README.md b/examples/cuda/hapi/README.md
index b13dca66..5a249b31 100644
--- a/examples/cuda/hapi/README.md
+++ b/examples/cuda/hapi/README.md
@@ -15,3 +15,13 @@ Usage
   - using torch: `stream_handle = torch.cuda.Stream().cuda_stream`
   - using numba: `stream_handle = numba.cuda.stream().handle.value`
 - currently, the hapiAddCudaCallback is restricted to torch and numba based Cuda streams.
+
+Running example
+
+- If running locally, use:  
+
+$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py  
+
+- If running on a cluster machine with Slurm, use:  
+
+$ srun -n <N> python3 hapi-cuda-callback.py 
diff --git a/examples/cuda/hapi/hapi-cuda-callback.py b/examples/cuda/hapi/hapi-cuda-callback.py
index 1e2801ce..a7887e52 100644
--- a/examples/cuda/hapi/hapi-cuda-callback.py
+++ b/examples/cuda/hapi/hapi-cuda-callback.py
@@ -44,4 +44,4 @@ def main(args):
 
     charm.exit()
 
-charm.start(main)
\ No newline at end of file
+charm.start(main)

From 1bca9da5c6ab79244e6d8de9bf6ffb4cdbe31fe6 Mon Sep 17 00:00:00 2001
From: Ritvik Rao <rsrao2@illinois.edu>
Date: Wed, 30 Apr 2025 08:22:07 -0500
Subject: [PATCH 14/14] fix readme

---
 examples/cuda/hapi/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cuda/hapi/README.md b/examples/cuda/hapi/README.md
index 5a249b31..f94f7bd2 100644
--- a/examples/cuda/hapi/README.md
+++ b/examples/cuda/hapi/README.md
@@ -20,8 +20,8 @@ Running example
 
 - If running locally, use:  
 
-$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py  
+`$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py`
 
 - If running on a cluster machine with Slurm, use:  
 
-$ srun -n <N> python3 hapi-cuda-callback.py 
+`$ srun -n <N> python3 hapi-cuda-callback.py`