forked from OSchip/llvm-project
[CUDA] More fixes for __shfl_* intrinsics.
* __shfl_{up,down}* uses unsigned int for the third parameter. * added [unsigned] long overloads for non-sync shuffles. Differential Revision: https://reviews.llvm.org/D41521 llvm-svn: 321326
This commit is contained in:
parent
170de4b4ba
commit
3cebc738b6
|
@ -34,23 +34,24 @@
|
|||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
#pragma push_macro("__MAKE_SHUFFLES")
|
||||
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
|
||||
inline __device__ int __FnName(int __val, int __offset, \
|
||||
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
|
||||
__Type) \
|
||||
inline __device__ int __FnName(int __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return __IntIntrinsic(__val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ float __FnName(float __val, int __offset, \
|
||||
inline __device__ float __FnName(float __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return __FloatIntrinsic(__val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \
|
||||
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned int>( \
|
||||
::__FnName(static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long long __FnName(long long __val, int __offset, \
|
||||
inline __device__ long long __FnName(long long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
struct __Bits { \
|
||||
int __a, __b; \
|
||||
|
@ -65,12 +66,29 @@
|
|||
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
|
||||
return __ret; \
|
||||
} \
|
||||
inline __device__ long __FnName(long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
_Static_assert(sizeof(long) == sizeof(long long) || \
|
||||
sizeof(long) == sizeof(int)); \
|
||||
if (sizeof(long) == sizeof(long long)) { \
|
||||
return static_cast<long>( \
|
||||
::__FnName(static_cast<long long>(__val), __offset, __width)); \
|
||||
} else if (sizeof(long) == sizeof(int)) { \
|
||||
return static_cast<long>( \
|
||||
::__FnName(static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
} \
|
||||
inline __device__ unsigned long __FnName( \
|
||||
unsigned long __val, __Type __offset, int __width = warpSize) { \
|
||||
return static_cast<unsigned long>( \
|
||||
::__FnName(static_cast<long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ unsigned long long __FnName( \
|
||||
unsigned long long __val, int __offset, int __width = warpSize) { \
|
||||
unsigned long long __val, __Type __offset, int __width = warpSize) { \
|
||||
return static_cast<unsigned long long>(::__FnName( \
|
||||
static_cast<unsigned long long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ double __FnName(double __val, int __offset, \
|
||||
inline __device__ double __FnName(double __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
long long __tmp; \
|
||||
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
|
||||
|
@ -81,13 +99,15 @@
|
|||
return __ret; \
|
||||
}
|
||||
|
||||
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
|
||||
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
|
||||
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
|
||||
// maxLane.
|
||||
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
|
||||
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
|
||||
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
||||
|
||||
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
|
||||
unsigned int);
|
||||
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
|
||||
unsigned int);
|
||||
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
|
||||
int);
|
||||
#pragma pop_macro("__MAKE_SHUFFLES")
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
@ -97,25 +117,26 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
// __shfl_sync_* variants available in CUDA-9
|
||||
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
|
||||
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
|
||||
__Mask) \
|
||||
inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
__Mask, __Type) \
|
||||
inline __device__ int __FnName(unsigned int __mask, int __val, \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
return __IntIntrinsic(__mask, __val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ float __FnName(unsigned int __mask, float __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
return __FloatIntrinsic(__mask, __val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ unsigned int __FnName(unsigned int __mask, \
|
||||
unsigned int __val, int __offset, \
|
||||
unsigned int __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned int>( \
|
||||
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
struct __Bits { \
|
||||
int __a, __b; \
|
||||
}; \
|
||||
|
@ -130,13 +151,13 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
return __ret; \
|
||||
} \
|
||||
inline __device__ unsigned long long __FnName( \
|
||||
unsigned int __mask, unsigned long long __val, int __offset, \
|
||||
unsigned int __mask, unsigned long long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned long long>(::__FnName( \
|
||||
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long __FnName(unsigned int __mask, long __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
_Static_assert(sizeof(long) == sizeof(long long) || \
|
||||
sizeof(long) == sizeof(int)); \
|
||||
if (sizeof(long) == sizeof(long long)) { \
|
||||
|
@ -147,14 +168,14 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
} \
|
||||
inline __device__ unsigned long __FnName(unsigned int __mask, \
|
||||
unsigned long __val, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
inline __device__ unsigned long __FnName( \
|
||||
unsigned int __mask, unsigned long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned long>( \
|
||||
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ double __FnName(unsigned int __mask, double __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
long long __tmp; \
|
||||
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
|
||||
memcpy(&__tmp, &__val, sizeof(__val)); \
|
||||
|
@ -164,15 +185,15 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
return __ret; \
|
||||
}
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
|
||||
__nvvm_shfl_sync_idx_f32, 0x1f);
|
||||
__nvvm_shfl_sync_idx_f32, 0x1f, int);
|
||||
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
|
||||
// maxLane.
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
|
||||
__nvvm_shfl_sync_up_f32, 0);
|
||||
__nvvm_shfl_sync_up_f32, 0, unsigned int);
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
|
||||
__nvvm_shfl_sync_down_f32, 0x1f);
|
||||
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
|
||||
__nvvm_shfl_sync_bfly_f32, 0x1f);
|
||||
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
|
||||
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
|
||||
|
||||
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
|
||||
|
|
Loading…
Reference in New Issue