From 3cebc738b6fd533056fd469eeb56034bffd2e891 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 21 Dec 2017 23:52:09 +0000 Subject: [PATCH] [CUDA] More fixes for __shfl_* intrinsics. * __shfl_{up,down}* uses unsigned int for the third parameter. * added [unsigned] long overloads for non-sync shuffles. Differential Revision: https://reviews.llvm.org/D41521 llvm-svn: 321326 --- clang/lib/Headers/__clang_cuda_intrinsics.h | 77 +++++++++++++-------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index 02d68a2e618e..1794eb3dc1d6 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -34,23 +34,24 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 #pragma push_macro("__MAKE_SHUFFLES") -#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \ - inline __device__ int __FnName(int __val, int __offset, \ +#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \ + __Type) \ + inline __device__ int __FnName(int __val, __Type __offset, \ int __width = warpSize) { \ return __IntIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ float __FnName(float __val, int __offset, \ + inline __device__ float __FnName(float __val, __Type __offset, \ int __width = warpSize) { \ return __FloatIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \ + inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \ int __width = warpSize) { \ return static_cast( \ ::__FnName(static_cast(__val), __offset, __width)); \ } \ - inline __device__ long long __FnName(long long __val, int __offset, \ + inline __device__ long long __FnName(long long __val, __Type __offset, \ int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ @@ -65,12 +66,29 @@ memcpy(&__ret, &__tmp, sizeof(__tmp)); \ return __ret; \ } \ + inline __device__ long __FnName(long __val, __Type __offset, \ + int __width = warpSize) { \ + _Static_assert(sizeof(long) == sizeof(long long) || \ + sizeof(long) == sizeof(int)); \ + if (sizeof(long) == sizeof(long long)) { \ + return static_cast( \ + ::__FnName(static_cast(__val), __offset, __width)); \ + } else if (sizeof(long) == sizeof(int)) { \ + return static_cast( \ + ::__FnName(static_cast(__val), __offset, __width)); \ + } \ + } \ + inline __device__ unsigned long __FnName( \ + unsigned long __val, __Type __offset, int __width = warpSize) { \ + return static_cast( \ + ::__FnName(static_cast(__val), __offset, __width)); \ + } \ inline __device__ unsigned long long __FnName( \ - unsigned long long __val, int __offset, int __width = warpSize) { \ + unsigned long long __val, __Type __offset, int __width = warpSize) { \ return static_cast(::__FnName( \ static_cast(__val), __offset, __width)); \ } \ - inline __device__ double __FnName(double __val, int __offset, \ + inline __device__ double __FnName(double __val, __Type __offset, \ int __width = warpSize) { \ long long __tmp; \ _Static_assert(sizeof(__tmp) == sizeof(__val)); \ @@ -81,13 +99,15 @@ return __ret; \ } -__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f); +__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int); // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= // maxLane. -__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0); -__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f); -__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); - +__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0, + unsigned int); +__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f, + unsigned int); +__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f, + int); #pragma pop_macro("__MAKE_SHUFFLES") #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 @@ -97,25 +117,26 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); // __shfl_sync_* variants available in CUDA-9 #pragma push_macro("__MAKE_SYNC_SHUFFLES") #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ - __Mask) \ - inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \ - int __width = warpSize) { \ + __Mask, __Type) \ + inline __device__ int __FnName(unsigned int __mask, int __val, \ + __Type __offset, int __width = warpSize) { \ return __IntIntrinsic(__mask, __val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ inline __device__ float __FnName(unsigned int __mask, float __val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, int __width = warpSize) { \ return __FloatIntrinsic(__mask, __val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ inline __device__ unsigned int __FnName(unsigned int __mask, \ - unsigned int __val, int __offset, \ + unsigned int __val, __Type __offset, \ int __width = warpSize) { \ return static_cast( \ ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ inline __device__ long long __FnName(unsigned int __mask, long long __val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, \ + int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ }; \ @@ -130,13 +151,13 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); return __ret; \ } \ inline __device__ unsigned long long __FnName( \ - unsigned int __mask, unsigned long long __val, int __offset, \ + unsigned int __mask, unsigned long long __val, __Type __offset, \ int __width = warpSize) { \ return static_cast(::__FnName( \ __mask, static_cast(__val), __offset, __width)); \ } \ inline __device__ long __FnName(unsigned int __mask, long __val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, int __width = warpSize) { \ _Static_assert(sizeof(long) == sizeof(long long) || \ sizeof(long) == sizeof(int)); \ if (sizeof(long) == sizeof(long long)) { \ @@ -147,14 +168,14 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ } \ - inline __device__ unsigned long __FnName(unsigned int __mask, \ - unsigned long __val, int __offset, \ - int __width = warpSize) { \ + inline __device__ unsigned long __FnName( \ + unsigned int __mask, unsigned long __val, __Type __offset, \ + int __width = warpSize) { \ return static_cast( \ ::__FnName(__mask, static_cast(__val), __offset, __width)); \ } \ inline __device__ double __FnName(unsigned int __mask, double __val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, int __width = warpSize) { \ long long __tmp; \ _Static_assert(sizeof(__tmp) == sizeof(__val)); \ memcpy(&__tmp, &__val, sizeof(__val)); \ @@ -164,15 +185,15 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); return __ret; \ } __MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32, - __nvvm_shfl_sync_idx_f32, 0x1f); + __nvvm_shfl_sync_idx_f32, 0x1f, int); // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= // maxLane. __MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32, - __nvvm_shfl_sync_up_f32, 0); + __nvvm_shfl_sync_up_f32, 0, unsigned int); __MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32, - __nvvm_shfl_sync_down_f32, 0x1f); + __nvvm_shfl_sync_down_f32, 0x1f, unsigned int); __MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32, - __nvvm_shfl_sync_bfly_f32, 0x1f); + __nvvm_shfl_sync_bfly_f32, 0x1f, int); #pragma pop_macro("__MAKE_SYNC_SHUFFLES") inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {