forked from OSchip/llvm-project
[CUDA] Implement __shfl* intrinsics in clang headers.
Summary: Clang changes to make use of the LLVM intrinsics added in D21160. Reviewers: tra Subscribers: jholewinski, cfe-commits Differential Revision: http://reviews.llvm.org/D21162 llvm-svn: 272299
This commit is contained in:
parent
ed2c282d4b
commit
4fb5711751
|
@ -402,6 +402,17 @@ BUILTIN(__nvvm_bar0_popc, "ii", "")
|
|||
BUILTIN(__nvvm_bar0_and, "ii", "")
|
||||
BUILTIN(__nvvm_bar0_or, "ii", "")
|
||||
|
||||
// Shuffle
|
||||
|
||||
BUILTIN(__builtin_ptx_shfl_down_i32, "iiii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_down_f32, "ffii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_up_i32, "iiii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_up_f32, "ffii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_bfly_i32, "iiii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_bfly_f32, "ffii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_idx_i32, "iiii", "")
|
||||
BUILTIN(__builtin_ptx_shfl_idx_f32, "ffii", "")
|
||||
|
||||
// Membar
|
||||
|
||||
BUILTIN(__nvvm_membar_cta, "v", "")
|
||||
|
|
|
@ -26,6 +26,76 @@
|
|||
#error "This file is for CUDA compilation only."
|
||||
#endif
|
||||
|
||||
// sm_30 intrinsics: __shfl_{up,down,xor}.
|
||||
|
||||
#define __SM_30_INTRINSICS_H__
|
||||
#define __SM_30_INTRINSICS_HPP__
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
#pragma push_macro("__MAKE_SHUFFLES")
|
||||
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
|
||||
inline __device__ int __FnName(int __in, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
return __IntIntrinsic(__in, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ float __FnName(float __in, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
return __FloatIntrinsic(__in, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ unsigned int __FnName(unsigned int __in, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned int>( \
|
||||
::__FnName(static_cast<int>(__in), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long long __FnName(long long __in, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
struct __Bits { \
|
||||
int __a, __b; \
|
||||
}; \
|
||||
_Static_assert(sizeof(__in) == sizeof(__Bits)); \
|
||||
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
|
||||
__Bits __tmp; \
|
||||
memcpy(&__in, &__tmp, sizeof(__in)); \
|
||||
__tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
|
||||
__tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
|
||||
long long __out; \
|
||||
memcpy(&__out, &__tmp, sizeof(__tmp)); \
|
||||
return __out; \
|
||||
} \
|
||||
inline __device__ unsigned long long __FnName( \
|
||||
unsigned long long __in, int __offset, int __width = warpSize) { \
|
||||
return static_cast<unsigned long long>( \
|
||||
::__FnName(static_cast<unsigned long long>(__in), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ double __FnName(double __in, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
long long __tmp; \
|
||||
_Static_assert(sizeof(__tmp) == sizeof(__in)); \
|
||||
memcpy(&__tmp, &__in, sizeof(__in)); \
|
||||
__tmp = ::__FnName(__tmp, __offset, __width); \
|
||||
double __out; \
|
||||
memcpy(&__out, &__tmp, sizeof(__out)); \
|
||||
return __out; \
|
||||
}
|
||||
|
||||
__MAKE_SHUFFLES(__shfl, __builtin_ptx_shfl_idx_i32, __builtin_ptx_shfl_idx_f32,
|
||||
0x1f);
|
||||
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
|
||||
// maxLane.
|
||||
__MAKE_SHUFFLES(__shfl_up, __builtin_ptx_shfl_up_i32, __builtin_ptx_shfl_up_f32,
|
||||
0);
|
||||
__MAKE_SHUFFLES(__shfl_down, __builtin_ptx_shfl_down_i32,
|
||||
__builtin_ptx_shfl_down_f32, 0x1f);
|
||||
__MAKE_SHUFFLES(__shfl_xor, __builtin_ptx_shfl_bfly_i32,
|
||||
__builtin_ptx_shfl_bfly_f32, 0x1f);
|
||||
|
||||
#pragma pop_macro("__MAKE_SHUFFLES")
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
|
||||
|
||||
// Prevent the vanilla sm_32 intrinsics header from being included.
|
||||
|
|
|
@ -198,13 +198,14 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
|
|||
#include "sm_20_atomic_functions.hpp"
|
||||
#include "sm_20_intrinsics.hpp"
|
||||
#include "sm_32_atomic_functions.hpp"
|
||||
// sm_30_intrinsics.h has declarations that use default argument, so
|
||||
// we have to include it and it will in turn include .hpp
|
||||
#include "sm_30_intrinsics.h"
|
||||
|
||||
// Don't include sm_32_intrinsics.h. That header defines __ldg using inline
|
||||
// asm, but we want to define it using builtins, because we can't use the
|
||||
// [addr+imm] addressing mode if we use the inline asm in the header.
|
||||
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
|
||||
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
|
||||
// define them using builtins so that the optimizer can reason about and across
|
||||
// these instructions. In particular, using intrinsics for ldg gets us the
|
||||
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
|
||||
// hardware, seems to generate faster machine code because ptxas can more easily
|
||||
// reason about our code.
|
||||
|
||||
#undef __MATH_FUNCTIONS_HPP__
|
||||
|
||||
|
|
Loading…
Reference in New Issue