From e15607471997e8df6cc34ae286a2c1ec9accf3ed Mon Sep 17 00:00:00 2001
From: Misha Chornyi
Date: Fri, 15 Aug 2025 17:38:21 -0700
Subject: [PATCH 1/2] Adapt to the API change introduced in "cuda-python",
 doc: https://nvidia.github.io/cuda-python/cuda-bindings/latest/api.html

---
 .../utils/cuda_shared_memory/__init__.py | 80 ++++++++++---------
 .../utils/cuda_shared_memory/_utils.py   | 26 +++---
 2 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py b/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py
index 3af88a0fd..7f3fb729f 100755
--- a/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py
+++ b/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py
@@ -28,6 +28,7 @@
 # Check for dependency before other import so other imports can assume
 # the module is available (drop "try ... except .."")
 try:
+    import cuda.bindings.driver as cuda_driver
     import cuda.bindings.runtime as cudart
 except ModuleNotFoundError as error:
     raise RuntimeError(
@@ -73,15 +74,15 @@ def _get_or_create_global_cuda_stream(device_id):
 def _support_uva(shm_device_id, ext_device_id):
     try:
         support_uva = call_cuda_function(
-            cudart.cudaDeviceGetAttribute,
-            cudart.cudaDeviceAttr.cudaDevAttrUnifiedAddressing,
-            shm_device_id,
+            cudart.cudaDeviceGetAttribute(
+                cudart.cudaDeviceAttr.cudaDevAttrUnifiedAddressing, shm_device_id
+            )
         )
         if (support_uva != 0) and (ext_device_id != -1):
             support_uva = call_cuda_function(
-                cudart.cudaDeviceGetAttribute,
-                cudart.cudaDeviceAttr.cudaDevAttrUnifiedAddressing,
-                ext_device_id,
+                cudart.cudaDeviceGetAttribute(
+                    cudart.cudaDeviceAttr.cudaDevAttrUnifiedAddressing, ext_device_id
+                )
             )
         if support_uva == 0:
             raise CudaSharedMemoryException(
@@ -127,10 +128,11 @@ def create_shared_memory_region(triton_shm_name, byte_size, device_id):
     """
     prev_device = None
     try:
-        prev_device = call_cuda_function(cudart.cudaGetDevice)
-        call_cuda_function(cudart.cudaSetDevice, device_id)
-        device_ptr = call_cuda_function(cudart.cudaMalloc, byte_size)
-        cuda_shm_handle = call_cuda_function(cudart.cudaIpcGetMemHandle, device_ptr)
+        cuda_driver.cuInit(device_id)
+        prev_device = call_cuda_function(cudart.cudaGetDevice())
+        call_cuda_function(cudart.cudaSetDevice(device_id))
+        device_ptr = call_cuda_function(cudart.cudaMalloc(byte_size))
+        cuda_shm_handle = call_cuda_function(cudart.cudaIpcGetMemHandle(device_ptr))
         triton_shm_handle = CudaSharedMemoryRegion(
             triton_shm_name, cuda_shm_handle, device_ptr, byte_size, device_id
         )
@@ -210,25 +212,27 @@ def set_shared_memory_region(cuda_shm_handle, input_values):
                 input_value = input_value.item()
                 byte_size = np.dtype(np.byte).itemsize * len(input_value)
                 call_cuda_function(
-                    cudart.cudaMemcpyAsync,
-                    cuda_shm_handle._base_addr + offset_current,
-                    input_value,
-                    byte_size,
-                    cudart.cudaMemcpyKind.cudaMemcpyDefault,
-                    stream,
+                    cudart.cudaMemcpyAsync(
+                        cuda_shm_handle._base_addr + offset_current,
+                        input_value,
+                        byte_size,
+                        cudart.cudaMemcpyKind.cudaMemcpyDefault,
+                        stream,
+                    )
                 )
             else:
                 byte_size = input_value.size * input_value.itemsize
                 call_cuda_function(
-                    cudart.cudaMemcpyAsync,
-                    cuda_shm_handle._base_addr + offset_current,
-                    input_value.ctypes.data,
-                    byte_size,
-                    cudart.cudaMemcpyKind.cudaMemcpyDefault,
-                    stream,
+                    cudart.cudaMemcpyAsync(
+                        cuda_shm_handle._base_addr + offset_current,
+                        input_value.ctypes.data,
+                        byte_size,
+                        cudart.cudaMemcpyKind.cudaMemcpyDefault,
+                        stream,
+                    )
                 )
             offset_current += byte_size
-        call_cuda_function(cudart.cudaStreamSynchronize, stream)
+        call_cuda_function(cudart.cudaStreamSynchronize(stream))
     except Exception as ex:
         if not isinstance(ex, CudaSharedMemoryException):
             raise CudaSharedMemoryException(
@@ -265,15 +269,16 @@ def get_contents_as_numpy(cuda_shm_handle, datatype, shape):
         # Numpy can only read from host buffer.
         host_buffer = (ctypes.c_char * cuda_shm_handle._byte_size)()
         call_cuda_function(
-            cudart.cudaMemcpyAsync,
-            host_buffer,
-            cuda_shm_handle._base_addr,
-            cuda_shm_handle._byte_size,
-            cudart.cudaMemcpyKind.cudaMemcpyDefault,
-            stream,
+            cudart.cudaMemcpyAsync(
+                host_buffer,
+                cuda_shm_handle._base_addr,
+                cuda_shm_handle._byte_size,
+                cudart.cudaMemcpyKind.cudaMemcpyDefault,
+                stream,
+            )
         )
         # Sync to ensure the host buffer is ready
-        call_cuda_function(cudart.cudaStreamSynchronize, stream)
+        call_cuda_function(cudart.cudaStreamSynchronize(stream))
     except Exception as ex:
         if not isinstance(ex, CudaSharedMemoryException):
             raise CudaSharedMemoryException(
@@ -368,14 +373,15 @@ def set_shared_memory_region_from_dlpack(cuda_shm_handle, input_values):
 
         try:
             call_cuda_function(
-                cudart.cudaMemcpyAsync,
-                cuda_shm_handle._base_addr + offset_current,
-                data_ptr,
-                byte_size,
-                cudart.cudaMemcpyKind.cudaMemcpyDefault,
-                stream,
+                cudart.cudaMemcpyAsync(
+                    cuda_shm_handle._base_addr + offset_current,
+                    data_ptr,
+                    byte_size,
+                    cudart.cudaMemcpyKind.cudaMemcpyDefault,
+                    stream,
+                )
             )
-            call_cuda_function(cudart.cudaStreamSynchronize, stream)
+            call_cuda_function(cudart.cudaStreamSynchronize(stream))
         except Exception as ex:
             if not isinstance(ex, CudaSharedMemoryException):
                 raise CudaSharedMemoryException(
diff --git a/src/python/library/tritonclient/utils/cuda_shared_memory/_utils.py b/src/python/library/tritonclient/utils/cuda_shared_memory/_utils.py
index 30d3f2bc0..f3acf74fd 100644
--- a/src/python/library/tritonclient/utils/cuda_shared_memory/_utils.py
+++ b/src/python/library/tritonclient/utils/cuda_shared_memory/_utils.py
@@ -30,8 +30,7 @@
 import cuda.bindings.runtime as cudart
 
 
-def call_cuda_function(function, *argv):
-    res = function(*argv)
+def call_cuda_function(res):
     err = res[0]
     if isinstance(err, cudart.cudaError_t):
         if err != cudart.cudaError_t.cudaSuccess:
@@ -92,9 +91,10 @@ def __del__(self):
             return
         prev_device = None
        try:
-            prev_device = call_cuda_function(cudart.cudaGetDevice)
-            call_cuda_function(cudart.cudaSetDevice, self._device_id)
-            call_cuda_function(cudart.cudaFree, self._base_addr)
+            cuda_driver.cuInit(self._device_id)
+            prev_device = call_cuda_function(cudart.cudaGetDevice())
+            call_cuda_function(cudart.cudaSetDevice(self._device_id))
+            call_cuda_function(cudart.cudaFree(self._base_addr))
         finally:
             if prev_device is not None:
                 maybe_set_device(prev_device)
@@ -104,9 +104,10 @@ class CudaStream:
     def __init__(self, device_id):
         prev_device = None
         try:
-            prev_device = call_cuda_function(cudart.cudaGetDevice)
-            call_cuda_function(cudart.cudaSetDevice, device_id)
-            self._stream = call_cuda_function(cudart.cudaStreamCreate)
+            cuda_driver.cuInit(device_id)
+            prev_device = call_cuda_function(cudart.cudaGetDevice())
+            call_cuda_function(cudart.cudaSetDevice(device_id))
+            self._stream = call_cuda_function(cudart.cudaStreamCreate())
         finally:
             if prev_device is not None:
                 maybe_set_device(prev_device)
@@ -117,12 +118,13 @@ def __del__(self):
         if not hasattr(self, "_stream") or self._stream is None:
             return
         # [FIXME] __del__ is not the best place for releasing resources
-        call_cuda_function(cudart.cudaStreamDestroy, self._stream)
+        call_cuda_function(cudart.cudaStreamDestroy(self._stream))
         self._stream = None
 
 
 def maybe_set_device(device_id):
-    device = call_cuda_function(cuda_driver.cuDeviceGet, device_id)
-    _, active = call_cuda_function(cuda_driver.cuDevicePrimaryCtxGetState, device)
+    cuda_driver.cuInit(device_id)
+    call_cuda_function(cuda_driver.cuDeviceGet(device_id))
+    _, active = call_cuda_function(cuda_driver.cuDevicePrimaryCtxGetState(device_id))
     if active:
-        call_cuda_function(cudart.cudaSetDevice, device_id)
+        call_cuda_function(cudart.cudaSetDevice(device_id))

From f93d6253d8a1fe659b35010504131df16f0fc3b2 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>
Date: Mon, 18 Aug 2025 16:27:25 -0700
Subject: [PATCH 2/2] Update
 src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py

Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com>
---
 .../library/tritonclient/utils/cuda_shared_memory/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py b/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py
index 7f3fb729f..153c7ff86 100755
--- a/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py
+++ b/src/python/library/tritonclient/utils/cuda_shared_memory/__init__.py
@@ -128,7 +128,7 @@ def create_shared_memory_region(triton_shm_name, byte_size, device_id):
     """
     prev_device = None
     try:
-        cuda_driver.cuInit(device_id)
+        call_cuda_function(cuda_driver.cuInit(device_id))
         prev_device = call_cuda_function(cudart.cudaGetDevice())
         call_cuda_function(cudart.cudaSetDevice(device_id))
         device_ptr = call_cuda_function(cudart.cudaMalloc(byte_size))
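
For context, below is a minimal sketch (not part of the patches above) of the calling
convention these changes move to, assuming the tuple-returning cuda-python bindings
documented at the URL in the subject line. The name check_cuda is a hypothetical
stand-in for call_cuda_function() in _utils.py.

import cuda.bindings.runtime as cudart


def check_cuda(res):
    # The cudart bindings return (cudaError_t, values...) tuples.
    err, *values = res
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"CUDA runtime call failed: {err}")
    if not values:
        return None
    return values[0] if len(values) == 1 else tuple(values)


# Old wrapper style:  call_cuda_function(cudart.cudaMalloc, byte_size)
# New style: invoke the binding directly and hand its result tuple to the wrapper.
device = check_cuda(cudart.cudaGetDevice())
device_ptr = check_cuda(cudart.cudaMalloc(1024))
check_cuda(cudart.cudaFree(device_ptr))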
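
The _utils.py hunks also route CUDA driver-API calls (cuInit, cuDeviceGet,
cuDevicePrimaryCtxGetState) through the same tuple convention. A rough sketch of that
path follows, with a hypothetical check_cu helper mirroring the one above; per the CUDA
driver documentation, cuInit takes an initialization-flags argument (0 is the only valid
value today) and cuDevicePrimaryCtxGetState expects the CUdevice handle from cuDeviceGet.

import cuda.bindings.driver as cuda_driver


def check_cu(res):
    # Driver bindings return (CUresult, values...) tuples.
    err, *values = res
    if err != cuda_driver.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"CUDA driver call failed: {err}")
    if not values:
        return None
    return values[0] if len(values) == 1 else tuple(values)


check_cu(cuda_driver.cuInit(0))  # initialization flags; currently must be 0
device = check_cu(cuda_driver.cuDeviceGet(0))  # CUdevice handle for ordinal 0
# Primary-context state is queried with the CUdevice handle.
flags, active = check_cu(cuda_driver.cuDevicePrimaryCtxGetState(device))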