Skip to content

Commit 3051f9a

Browse files
committed
cudev: Add _shfl_down implementation for long long and unsigned long long for devices of compute capability less than 7.0
1 parent 28e6ce5 commit 3051f9a

File tree

1 file changed

+17
-1
lines changed

1 file changed

+17
-1
lines changed

modules/cudev/include/opencv2/cudev/warp/shuffle.hpp

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -334,12 +334,28 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp
334334

335335
// Overload of shfl_down for `signed long long`: forwards `val`, `delta` and
// `width` to the `__shfl_down` warp-shuffle intrinsic and returns the result.
//
// Parameters:
//   val   - value this thread passes to the shuffle.
//   delta - lane offset handed unchanged to `__shfl_down`.
//   width - handed unchanged to `__shfl_down`; defaults to `warpSize`.
//
// When compiling device code for `__CUDA_ARCH__ < 700`, the value is split
// into the two `int` components of an `int2` and each component is shuffled
// separately; otherwise `val` is passed to `__shfl_down` directly.
//
// NOTE: the bare tokens between code lines (e.g. `336336`, `337+`) are
// old/new line-number gutters from the diff view; a trailing `+` marks a
// line added by this commit. They are not part of the program.
__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize)
336336
{
337+
// Taken only when __CUDA_ARCH__ is defined and below 700.
// NOTE(review): presumably a 64-bit `__shfl_down` overload is not usable on
// these targets/toolkits -- confirm against the supported CUDA versions.
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ < 700
338+
// The union lets the same storage be accessed as one `long long` or as an
// `int2` (components `x` and `y`).
union { long long ll; int2 i2; } u;
339+
u.ll = val;
340+
// Shuffle each component with the same `delta`/`width`, so both halves are
// requested with identical shuffle arguments.
u.i2.x = __shfl_down(u.i2.x, delta, width);
341+
u.i2.y = __shfl_down(u.i2.y, delta, width);
342+
// Read the two shuffled components back as a single `long long`.
return u.ll;
343+
#else
337344
// __CUDA_ARCH__ undefined or >= 700: pass the 64-bit value straight through.
return __shfl_down(val, delta, width);
345+
#endif
338346
}
339347

340348
// Overload of shfl_down for `unsigned long long`: forwards `val`, `delta` and
// `width` to the `__shfl_down` warp-shuffle intrinsic and returns the result.
//
// Parameters:
//   val   - value this thread passes to the shuffle.
//   delta - lane offset handed unchanged to `__shfl_down`.
//   width - handed unchanged to `__shfl_down`; defaults to `warpSize`.
//
// When compiling device code for `__CUDA_ARCH__ < 700`, the value is split
// into the two components of a `uint2` and each component is shuffled
// separately; otherwise `val` is passed to `__shfl_down` directly.
//
// NOTE: the bare tokens between code lines (e.g. `341349`, `350+`, `342-`) are
// old/new line-number gutters from the diff view. A trailing `+` marks a line
// added by this commit and a trailing `-` marks the line it removed. They are
// not part of the program.
__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize)
341349
{
342-
// Pre-commit body (the single line removed by this diff): an unconditional
// call to `__shfl_down` with the result cast to `unsigned long long`. It is
// replaced by the conditional block below.
return (unsigned long long) __shfl_down(val, delta, width);
350+
// Taken only when __CUDA_ARCH__ is defined and below 700.
// NOTE(review): presumably a 64-bit `__shfl_down` overload is not usable on
// these targets/toolkits -- confirm against the supported CUDA versions.
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ < 700
351+
// The union lets the same storage be accessed as one `unsigned long long` or
// as a `uint2` (components `x` and `y`).
union { unsigned long long ull; uint2 u2; } u;
352+
u.ull = val;
353+
// Each unsigned component is cast to `int` before the call and the result is
// assigned back to the unsigned component.
// NOTE(review): the cast presumably selects the `int` overload of
// `__shfl_down` explicitly -- confirm which overloads are in scope.
u.u2.x = __shfl_down(static_cast<int>(u.u2.x), delta, width);
353+
u.u2.y = __shfl_down(static_cast<int>(u.u2.y), delta, width);
355+
// Read the two shuffled components back as a single `unsigned long long`.
return u.ull;
356+
#else
357+
// __CUDA_ARCH__ undefined or >= 700: pass the 64-bit value straight through
// (the explicit result cast of the removed line is no longer applied).
return __shfl_down(val, delta, width);
358+
#endif
343359
}
344360

345361
__device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize)

0 commit comments

Comments
 (0)