Add prototype for multi-gpu

efaulhaber · efaulhaber · commit 98d83c1566de · 2024-12-23T09:38:10.000+01:00
diff --git a/Project.toml b/Project.toml
@@ -6,6 +6,7 @@ version = "0.4.6-dev"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -16,6 +17,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 [compat]
 Adapt = "3, 4"
 Atomix = "0.1, 1"
+CUDA = "5.4.3"
 GPUArraysCore = "0.1, 0.2"
 KernelAbstractions = "0.9"
 LinearAlgebra = "1"
diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl
@@ -10,6 +10,7 @@ using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synch
 using LinearAlgebra: dot
 using Polyester: Polyester
 @reexport using StaticArrays: SVector
+using CUDA
 
 include("util.jl")
 include("vector_of_vectors.jl")
diff --git a/src/gpu.jl b/src/gpu.jl
@@ -10,6 +10,11 @@
 Adapt.@adapt_structure FullGridCellList
 Adapt.@adapt_structure DynamicVectorOfVectors
 
+# TODO: quick-and-dirty prototype. Host `Array`s become `CuArray`s with `CUDA.UnifiedMemory`.
+function Adapt.adapt_structure(to::typeof(CuArray), array::Array{T, N}) where {T, N}
+    return CuArray{T, N, CUDA.UnifiedMemory}(array)
+end
+
 # `adapt(CuArray, ::SVector)::SVector`, but `adapt(Array, ::SVector)::Vector`.
 # We don't want to change the type of the `SVector` here.
 function Adapt.adapt_structure(to::typeof(Array), svector::SVector)
diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
@@ -412,10 +412,22 @@ end
     # max_particles_per_cell = maximum(lengths)
     nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
     ndrange = max_particles_per_cell * length(nonempty_cells)
+
+    n_gpus = length(CUDA.devices()) # number of devices reported by `CUDA.devices()`
+    ndrange_local = [div(ndrange, n_gpus) for _ in 1:n_gpus] # equal share of `ndrange` each
+    ndrange_local[end] += ndrange % n_gpus # the last entry also takes the remainder
+    # NOTE(review): launches differ only in `ndrange` (no offset) — ranges may overlap; TODO confirm
     kernel = foreach_neighbor_localmem(backend, (max_particles_per_cell,))
-    kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange)
+    @sync for i in 1:n_gpus
+        Threads.@spawn begin
+            CUDA.device!(i - 1) # presumably a zero-based device ordinal — confirm against CUDA.jl
+            kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange = ndrange_local[i])
+            KernelAbstractions.synchronize(backend) # synchronize inside the task that launched
+        end
+    end
+    # kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange)
 
-    KernelAbstractions.synchronize(backend)
+    # KernelAbstractions.synchronize(backend)
 
     return nothing
 end
diff --git a/src/util.jl b/src/util.jl
@@ -144,11 +144,23 @@ end
 
     # Call the generic kernel that is defined below, which only calls a function with
     # the global GPU index.
-    generic_kernel(backend)(ndrange = ndrange) do i
-        @inbounds @inline f(iterator[indices[i]])
+    n_gpus = length(CUDA.devices()) # one kernel launch per GPU, each from its own task
+    chunk = div(ndrange, n_gpus) # items per GPU; the last GPU also takes the remainder
+    @sync for i in 1:n_gpus
+        Threads.@spawn begin
+            CUDA.device!(i - 1) # CUDA.jl selects devices by zero-based ordinal
+            offset = (i - 1) * chunk # start of this GPU's contiguous slice of `1:ndrange`
+            n_local = i == n_gpus ? ndrange - offset : chunk
+            generic_kernel(backend)(ndrange = n_local) do j
+                @inbounds @inline f(iterator[indices[offset + j]])
+            end
+            KernelAbstractions.synchronize(backend)
+        end
     end
-
-    KernelAbstractions.synchronize(backend)
+    # generic_kernel(backend)(ndrange = ndrange) do i
+    #     @inbounds @inline f(iterator[indices[i]])
+    # end
+    # KernelAbstractions.synchronize(backend)
 end
 
 @kernel function generic_kernel(f)