@@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics
using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index
using PointNeighbors.Adapt: Adapt
using Base: @propagate_inbounds
using PointNeighbors.BenchmarkTools

const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}

    CUDA.device!(0)
end
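# For reference: an array matching the `UnifiedCuArray` alias above can be allocated
# with the `unified` keyword of `cu` (the same keyword is used for the cell offsets
# further below), e.g.
#     x = cu(zeros(Int32, 4); unified = true)
#     x isa UnifiedCuArray  # -> true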
############################################################################################
# First approach: Use actual system atomics on unified memory
############################################################################################
# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
#     CUDA.LLVM.Interop.@asmcall(
#         "atom.sys.global.add.u32 \$0, [\$1], \$2;",
#         "=r,l,r,~{memory}",
#         true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
#         ptr, val
#     )
# end
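# Note: the `.sys` scope in the PTX instruction above makes the atomic update visible
# system-wide (to all GPUs and the host), in contrast to the narrower default `.gpu`
# scope, which is only atomic with respect to threads on the same device.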
# @inline function pushat_atomic_system!(vov, i, value)
#     (; backend, lengths) = vov

#     @boundscheck checkbounds(vov, i)

#     # Increment the column length with an atomic add to avoid race conditions.
#     # Store the new value since it might be changed immediately afterwards by another
#     # thread.
#     # new_length = Atomix.@atomic lengths[i] += 1
#     new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1

#     # We can write here without race conditions, since the atomic add guarantees
#     # that `new_length` is different for each thread.
#     backend[new_length, i] = value

#     return vov
# end
# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}

# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
#     (; cell_list) = neighborhood_search

#     cell_list.cells.lengths .= 0

#     if neighborhood_search.search_radius < eps()
#         # Cannot initialize with zero search radius.
#         # This is used in TrixiParticles when a neighborhood search is not used.
#         return neighborhood_search
#     end

#     # Faster on a single GPU
#     # CUDA.NVTX.@range "threaded loop" begin
#     @threaded y for point in axes(y, 2)
#         # Get cell index of the point's cell
#         point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
#         cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)

#         # Add point to corresponding cell
#         pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
#     end
#     # end

#     return neighborhood_search
# end

# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch,
#                                      y::AbstractMatrix; parallelization_backend = y)
#     PointNeighbors.initialize_grid!(neighborhood_search, y)
# end

############################################################################################
# Second approach: Use a device array to update and copy to unified memory
############################################################################################
# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
#     vov_unified :: VU
#     vov_device  :: VD

# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)

# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
#     return Adapt.adapt(to, vov.vov_unified)
# end

# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)

#     # The atomic version of `pushat!` uses atomics to avoid race conditions when
#     # `pushat!` is used in a parallel loop.
#     @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
# end

# # Copy vector of vectors to unified memory
@@ -203,4 +214,188 @@ end
#     PointNeighbors.initialize_grid!(neighborhood_search, y)
# end

############################################################################################
# Third approach: Like the second approach, but with a device array for each GPU.
#
# This should have the advantage that each GPU updates the part of the grid that should
# already be in its memory, so that we avoid moving everything to the first GPU.
############################################################################################
# Generic kernel that calls `f(i, gpu)` for each global index `i` in the `ndrange`,
# passing through the extra kernel argument `gpu`
KernelAbstractions.@kernel function generic_kernel2(f, gpu)
    i = KernelAbstractions.@index(Global)
    @inline f(i, gpu)
end

@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T}
    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
    # and index with `iterator[eachindex(iterator)[i]]`.
    # Note that this only works with vector-like iterators that support arbitrary indexing.
    indices = eachindex(iterator)

    # Skip empty loops
    length(indices) == 0 && return

    # Partition `ndrange` to the GPUs
    n_gpus = length(CUDA.devices())
    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
    @assert length(indices_split) <= n_gpus
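    # For example, 10 indices on 4 GPUs are split into partitions of sizes 3, 3, 3 and 1,
    # since `ceil(Int, 10 / 4) == 3`.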

    backend = CUDABackend()

    # Synchronize each device
    for i in 1:n_gpus
        CUDA.device!(i - 1)
        KernelAbstractions.synchronize(backend)
    end

    # Launch kernel on each device
    for (i, indices_) in enumerate(indices_split)
        # Select the correct device for this partition
        CUDA.device!(i - 1)

        # Call the generic kernel, which just calls `f` with the global index
        # and this GPU's payload
        generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
            @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
        end
    end

    # Synchronize each device
    for i in 1:n_gpus
        CUDA.device!(i - 1)
        KernelAbstractions.synchronize(backend)
    end

    # Select first device again
    CUDA.device!(0)
end
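# Hypothetical usage sketch, mirroring the call in `initialize_grid!` below: pass one
# payload per GPU (here `vov_per_gpu`), and the closure receives each global index
# together with the payload of the GPU that processes it:
#     parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
#         # ... push `point` into `vov`, the cell list of this GPU ...
#     end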

struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
    vov_unified :: VOV
    vov_per_gpu :: V
end

function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
    MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
end

# Adapt.@adapt_structure(MultiGPUVectorOfVectors)

function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
    return Adapt.adapt(to, vov.vov_unified)
end
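# With the adaptor above, a kernel launched with a `MultiGPUVectorOfVectors` argument
# only ever sees `vov_unified`, so device code reads the merged cell list in unified
# memory. The per-GPU vectors are only used while updating the grid below.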

# Adapt a `DynamicVectorOfVectors` to the multi-GPU backend: one vector of vectors for
# the merged cell list (the `to` adaptor is assumed to move its arrays to unified
# memory), plus one device-memory copy per GPU.
function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
    max_inner_length, max_outer_length = size(vov.backend)

    n_gpus = length(CUDA.devices())
    vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]

    vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
    vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
    for vov__ in vov_per_gpu_
        vov__.length_[] = vov.length_[]
    end

    return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
end

const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
    (; cell_list) = neighborhood_search
    (; cells) = cell_list
    (; vov_unified, vov_per_gpu) = cells

    for vov_ in vov_per_gpu
        vov_.lengths .= 0
    end

    if neighborhood_search.search_radius < eps()
        # Cannot initialize with zero search radius.
        # This is used in TrixiParticles when a neighborhood search is not used.
        return neighborhood_search
    end

    # Fill cell lists per GPU
    parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
        # Get cell index of the point's cell
        point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
        cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)

        # Add point to corresponding cell
        @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)

        # The atomic version of `pushat!` uses atomics to avoid race conditions when
        # `pushat!` is used in a parallel loop.
        @inbounds PointNeighbors.pushat_atomic!(vov, PointNeighbors.cell_index(cell_list, cell), point)
    end

    # Compute, for each cell, the total number of points and the offset at which each
    # GPU writes its points into the merged unified-memory cell list
    lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu))
    CUDA.synchronize()
    offsets_ = cu(cumsum(lengths), unified = true)
    CUDA.synchronize()
    vov_unified.lengths .= offsets_[end]
    CUDA.synchronize()
    offsets = offsets_ .- lengths
    nonempty = ntuple(gpu -> findall(x -> x > 0, lengths[gpu]), length(vov_per_gpu))
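    # For example, for a cell with `lengths[1][cell] == 2` and `lengths[2][cell] == 3`
    # (two GPUs), `cumsum` gives cumulative lengths 2 and 5: the cell contains 5 points
    # in total, GPU 1 writes at offset 0 (rows 1:2) and GPU 2 at offset 2 (rows 3:5).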

    n_gpus = length(CUDA.devices())
    backend = CUDABackend()

    # Synchronize each device
    for i in 1:n_gpus
        CUDA.device!(i - 1)
        KernelAbstractions.synchronize(backend)
    end

    # Launch kernel on each device to merge the per-GPU cell lists into the unified one
    for (gpu, indices_) in enumerate(nonempty)
        # Select the correct device for this partition
        CUDA.device!(gpu - 1)

        # Copy the nonempty cells of this GPU into the unified cell list
        generic_kernel2(backend)((vov_per_gpu[gpu], offsets[gpu]), ndrange = length(indices_)) do j, (vov, offset_gpu)
            cell = @inbounds indices_[j]
            offset = offset_gpu[cell]
            # offset = offsets[gpu][cell]

            points = vov[cell]
            # points = vov_per_gpu[gpu][cell]
            for i in eachindex(points)
                vov_unified.backend[offset + i, cell] = points[i]
            end
        end
    end

    # Synchronize each device
    for i in 1:n_gpus
        CUDA.device!(i - 1)
        KernelAbstractions.synchronize(backend)
    end

    # Select first device again
    CUDA.device!(0)

    # # Merge cell lists
    # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
    #     offset = offsets[gpu][cell]

    #     points = vov_per_gpu[gpu][cell]
    #     for i in eachindex(points)
    #         vov_unified.backend[offset + i, cell] = points[i]
    #     end
    # end

    return neighborhood_search
end

function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
                                x::AbstractMatrix, y::AbstractMatrix;
                                points_moving = (true, true), parallelization_backend = x)
    # The coordinates of the first set of points are irrelevant for this NHS.
    # Only update when the second set is moving.
    points_moving[2] || return neighborhood_search

    PointNeighbors.initialize_grid!(neighborhood_search, y)
end
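# Hypothetical usage sketch (assuming the `CUDAMultiGPUBackend()` adaptor defined above
# in this file; the constructor keywords follow the public PointNeighbors API):
#     cell_list = FullGridCellList(; min_corner, max_corner, search_radius)
#     nhs = GridNeighborhoodSearch{3}(; search_radius, n_points, cell_list)
#     nhs_gpu = Adapt.adapt(CUDAMultiGPUBackend(), nhs)
#     PointNeighbors.initialize_grid!(nhs_gpu, y)  # `y` is a matrix of point coordinates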

end # module