zama-ai
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
Lines changed: 24 additions & 1 deletion
diff --git a/tfhe/src/high_level_api/array/gpu/booleans.rs b/tfhe/src/high_level_api/array/gpu/booleans.rs
Lines changed: 46 additions & 59 deletions
diff --git a/tfhe/src/high_level_api/array/gpu/integers.rs b/tfhe/src/high_level_api/array/gpu/integers.rs
Lines changed: 14 additions & 17 deletions
@@ -17,7 +17,6 @@ pub use entities::*;
 use std::ffi::c_void;
 use tfhe_cuda_backend::bindings::*;
 use tfhe_cuda_backend::cuda_bind::*;
-
 pub struct CudaStreams {
     pub ptr: Vec<*mut c_void>,
     pub gpu_indexes: Vec<GpuIndex>,
@@ -43,6 +42,22 @@ impl CudaStreams {
             gpu_indexes,
         }
     }
+    /// Create a new `CudaStreams` with one stream per GPU index in `indexes`, in the given order
+    pub fn new_multi_gpu_with_indexes(indexes: &[GpuIndex]) -> Self {
+        let _gpu_count = setup_multi_gpu(); // the returned value is not used here
+
+        let mut gpu_indexes = Vec::with_capacity(indexes.len());
+        let mut ptr_array = Vec::with_capacity(indexes.len());
+
+        for &i in indexes {
+            ptr_array.push(unsafe { cuda_create_stream(i.get()) }); // `i` is not validated here
+            gpu_indexes.push(i);
+        }
+        Self {
+            ptr: ptr_array,
+            gpu_indexes,
+        }
+    }
     /// Create a new `CudaStreams` structure with one GPU, whose index corresponds to the one given
     /// as input
     pub fn new_single_gpu(gpu_index: GpuIndex) -> Self {
@@ -88,6 +103,14 @@ impl CudaStreams {
     }
 }
 
+impl Clone for CudaStreams {
+    fn clone(&self) -> Self {
+        // Rebuild the streams from `gpu_indexes` via `new_multi_gpu_with_indexes()` rather than
+        // copying `ptr`, so any set of streams (single, multi, or custom) can be cloned
+        Self::new_multi_gpu_with_indexes(self.gpu_indexes.as_slice())
+    }
+}
+
 impl Drop for CudaStreams {
     fn drop(&mut self) {
         for (i, &s) in self.ptr.iter().enumerate() {
 
@@ -156,14 +156,13 @@ impl BitwiseArrayBackend for GpuFheBoolArrayBackend {
         rhs: TensorSlice<'_, Self::Slice<'a>>,
     ) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .zip(rhs.par_iter())
-                    .map(|(lhs, rhs)| {
-                        CudaBooleanBlock(cuda_key.pbs_key().bitand(&lhs.0, &rhs.0, streams))
-                    })
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams;
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| {
+                    CudaBooleanBlock(cuda_key.pbs_key().bitand(&lhs.0, &rhs.0, streams))
+                })
+                .collect::<Vec<_>>()
         }))
     }
 
@@ -172,14 +171,13 @@ impl BitwiseArrayBackend for GpuFheBoolArrayBackend {
         rhs: TensorSlice<'_, Self::Slice<'a>>,
     ) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .zip(rhs.par_iter())
-                    .map(|(lhs, rhs)| {
-                        CudaBooleanBlock(cuda_key.pbs_key().bitor(&lhs.0, &rhs.0, streams))
-                    })
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams;
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| {
+                    CudaBooleanBlock(cuda_key.pbs_key().bitor(&lhs.0, &rhs.0, streams))
+                })
+                .collect::<Vec<_>>()
         }))
     }
 
@@ -188,24 +186,22 @@ impl BitwiseArrayBackend for GpuFheBoolArrayBackend {
         rhs: TensorSlice<'_, Self::Slice<'a>>,
     ) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .zip(rhs.par_iter())
-                    .map(|(lhs, rhs)| {
-                        CudaBooleanBlock(cuda_key.pbs_key().bitxor(&lhs.0, &rhs.0, streams))
-                    })
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams;
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| {
+                    CudaBooleanBlock(cuda_key.pbs_key().bitxor(&lhs.0, &rhs.0, streams))
+                })
+                .collect::<Vec<_>>()
         }))
     }
 
     fn bitnot(lhs: TensorSlice<'_, Self::Slice<'_>>) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .map(|lhs| CudaBooleanBlock(cuda_key.pbs_key().bitnot(&lhs.0, streams)))
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams; // borrow the streams stored on `cuda_key`
+            lhs.par_iter()
+                .map(|lhs| CudaBooleanBlock(cuda_key.pbs_key().bitnot(&lhs.0, streams)))
+                .collect::<Vec<_>>() // one output block per input element
         }))
     }
 }
@@ -216,16 +212,13 @@ impl ClearBitwiseArrayBackend<bool> for GpuFheBoolArrayBackend {
         rhs: TensorSlice<'_, &'_ [bool]>,
     ) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .zip(rhs.par_iter().copied())
-                    .map(|(lhs, rhs)| {
-                        CudaBooleanBlock(
-                            cuda_key.pbs_key().scalar_bitand(&lhs.0, rhs as u8, streams),
-                        )
-                    })
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams;
+            lhs.par_iter()
+                .zip(rhs.par_iter().copied())
+                .map(|(lhs, rhs)| {
+                    CudaBooleanBlock(cuda_key.pbs_key().scalar_bitand(&lhs.0, rhs as u8, streams))
+                })
+                .collect::<Vec<_>>()
         }))
     }
 
@@ -234,16 +227,13 @@ impl ClearBitwiseArrayBackend<bool> for GpuFheBoolArrayBackend {
         rhs: TensorSlice<'_, &'_ [bool]>,
     ) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .zip(rhs.par_iter().copied())
-                    .map(|(lhs, rhs)| {
-                        CudaBooleanBlock(
-                            cuda_key.pbs_key().scalar_bitor(&lhs.0, rhs as u8, streams),
-                        )
-                    })
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams;
+            lhs.par_iter()
+                .zip(rhs.par_iter().copied())
+                .map(|(lhs, rhs)| {
+                    CudaBooleanBlock(cuda_key.pbs_key().scalar_bitor(&lhs.0, rhs as u8, streams))
+                })
+                .collect::<Vec<_>>()
         }))
     }
 
@@ -252,16 +242,13 @@ impl ClearBitwiseArrayBackend<bool> for GpuFheBoolArrayBackend {
         rhs: TensorSlice<'_, &'_ [bool]>,
     ) -> Self::Owned {
         GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .zip(rhs.par_iter().copied())
-                    .map(|(lhs, rhs)| {
-                        CudaBooleanBlock(
-                            cuda_key.pbs_key().scalar_bitxor(&lhs.0, rhs as u8, streams),
-                        )
-                    })
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams;
+            lhs.par_iter()
+                .zip(rhs.par_iter().copied())
+                .map(|(lhs, rhs)| {
+                    CudaBooleanBlock(cuda_key.pbs_key().scalar_bitxor(&lhs.0, rhs as u8, streams))
+                })
+                .collect::<Vec<_>>()
         }))
     }
 }
 
@@ -108,12 +108,11 @@ where
     F: Send + Sync + Fn(&crate::integer::gpu::CudaServerKey, &T, &T, &CudaStreams) -> T,
 {
     GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-        with_thread_local_cuda_streams(|streams| {
-            lhs.par_iter()
-                .zip(rhs.par_iter())
-                .map(|(lhs, rhs)| op(cuda_key.pbs_key(), lhs, rhs, streams))
-                .collect::<Vec<_>>()
-        })
+        let streams = &cuda_key.streams;
+        lhs.par_iter()
+            .zip(rhs.par_iter())
+            .map(|(lhs, rhs)| op(cuda_key.pbs_key(), lhs, rhs, streams))
+            .collect::<Vec<_>>()
     }))
 }
 
@@ -170,12 +169,11 @@ where
     F: Send + Sync + Fn(&crate::integer::gpu::CudaServerKey, &T, Clear, &CudaStreams) -> T,
 {
     GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-        with_thread_local_cuda_streams(|streams| {
-            lhs.par_iter()
-                .zip(rhs.par_iter())
-                .map(|(lhs, rhs)| op(cuda_key.pbs_key(), lhs, *rhs, streams))
-                .collect::<Vec<_>>()
-        })
+        let streams = &cuda_key.streams;
+        lhs.par_iter()
+            .zip(rhs.par_iter())
+            .map(|(lhs, rhs)| op(cuda_key.pbs_key(), lhs, *rhs, streams))
+            .collect::<Vec<_>>()
     }))
 }
 
@@ -336,11 +334,10 @@ where
 
     fn bitnot(lhs: TensorSlice<'_, Self::Slice<'_>>) -> Self::Owned {
         GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
-            with_thread_local_cuda_streams(|streams| {
-                lhs.par_iter()
-                    .map(|lhs| cuda_key.pbs_key().bitnot(lhs, streams))
-                    .collect::<Vec<_>>()
-            })
+            let streams = &cuda_key.streams; // borrow the streams stored on `cuda_key`
+            lhs.par_iter()
+                .map(|lhs| cuda_key.pbs_key().bitnot(lhs, streams))
+                .collect::<Vec<_>>() // one output element per input element
         }))
     }
 }