[CUDA] Implement __shfl* intrinsics in clang headers.

Summary: Clang changes to make use of the LLVM intrinsics added in D21160.

Reviewers: tra

Subscribers: jholewinski, cfe-commits

Differential Revision: http://reviews.llvm.org/D21162

llvm-svn: 272299
Justin Lebar, 2016-06-09 20:04:57 +00:00
commit 4fb5711751 (parent ed2c282d4b)
3 changed files with 88 additions and 6 deletions

include/clang/Basic/BuiltinsNVPTX.def

@@ -402,6 +402,17 @@ BUILTIN(__nvvm_bar0_popc, "ii", "")
BUILTIN(__nvvm_bar0_and, "ii", "")
BUILTIN(__nvvm_bar0_or, "ii", "")
// Shuffle
BUILTIN(__builtin_ptx_shfl_down_i32, "iiii", "")
BUILTIN(__builtin_ptx_shfl_down_f32, "ffii", "")
BUILTIN(__builtin_ptx_shfl_up_i32, "iiii", "")
BUILTIN(__builtin_ptx_shfl_up_f32, "ffii", "")
BUILTIN(__builtin_ptx_shfl_bfly_i32, "iiii", "")
BUILTIN(__builtin_ptx_shfl_bfly_f32, "ffii", "")
BUILTIN(__builtin_ptx_shfl_idx_i32, "iiii", "")
BUILTIN(__builtin_ptx_shfl_idx_f32, "ffii", "")
// Membar
BUILTIN(__nvvm_membar_cta, "v", "")
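
The second field of each BUILTIN entry encodes the prototype: the first
character is the return type and the rest are parameter types, so "iiii" is
int(int, int, int) and "ffii" is float(float, int, int). These builtins are
callable directly from device code compiled with clang; as a minimal sketch
(the function name and reduction loop are illustrative, not part of the patch):

// Butterfly-sum a value across a full 32-lane warp. The third argument is the
// packed control word; 0x1f means width == warpSize with lane clamp 31.
__device__ float __warp_sum(float __v) {
  for (int __delta = 16; __delta > 0; __delta /= 2)
    __v += __builtin_ptx_shfl_bfly_f32(__v, __delta, 0x1f);
  return __v;
}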

lib/Headers/__clang_cuda_intrinsics.h

@@ -26,6 +26,76 @@
#error "This file is for CUDA compilation only."
#endif
// sm_30 intrinsics: __shfl_{up,down,xor}.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
inline __device__ int __FnName(int __in, int __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__in, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __in, int __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__in, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __in, int __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__in), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __in, int __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__in) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__in, sizeof(__in)); \
__tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
long long __out; \
memcpy(&__out, &__tmp, sizeof(__tmp)); \
return __out; \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __in, int __offset, int __width = warpSize) { \
return static_cast<unsigned long long>( \
::__FnName(static_cast<long long>(__in), __offset, __width)); \
} \
inline __device__ double __FnName(double __in, int __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__in)); \
memcpy(&__tmp, &__in, sizeof(__in)); \
__tmp = ::__FnName(__tmp, __offset, __width); \
double __out; \
memcpy(&__out, &__tmp, sizeof(__out)); \
return __out; \
}
__MAKE_SHUFFLES(__shfl, __builtin_ptx_shfl_idx_i32, __builtin_ptx_shfl_idx_f32,
0x1f);
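// The control word passed as the third builtin argument packs two fields:
// bits [12:8] hold warpSize - __width, which partitions the warp into
// independent segments of __width lanes, and bits [4:0] hold the lane clamp.
// E.g. __width == 16 yields ((32 - 16) << 8) | 0x1f == 0x101f.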
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __builtin_ptx_shfl_up_i32, __builtin_ptx_shfl_up_f32,
0);
__MAKE_SHUFFLES(__shfl_down, __builtin_ptx_shfl_down_i32,
__builtin_ptx_shfl_down_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_xor, __builtin_ptx_shfl_bfly_i32,
__builtin_ptx_shfl_bfly_f32, 0x1f);
#pragma pop_macro("__MAKE_SHUFFLES")
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
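
Once this header is expanded, __shfl, __shfl_up, __shfl_down, and __shfl_xor
have the same shape as the CUDA SDK versions, including the defaulted __width
parameter and 64-bit overloads assembled from two 32-bit shuffles. A usage
sketch (the function name is illustrative, not part of the patch):

// Sum within independent 16-lane segments; after the loop, lane 0 of each
// segment holds that segment's total.
__device__ float __half_warp_sum(float __v) {
  for (int __delta = 8; __delta > 0; __delta /= 2)
    __v += __shfl_down(__v, __delta, /*__width=*/16);
  return __v;
}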

lib/Headers/__clang_cuda_runtime_wrapper.h

@@ -198,13 +198,14 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#include "sm_20_atomic_functions.hpp"
#include "sm_20_intrinsics.hpp"
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
// define them using builtins so that the optimizer can reason about and across
// these instructions. In particular, using intrinsics for ldg gets us the
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
// hardware, seems to generate faster machine code because ptxas can more easily
// reason about our code.
#undef __MATH_FUNCTIONS_HPP__
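
For contrast, the inline-asm formulation that the vanilla SDK headers use looks
roughly like the sketch below (simplified; not NVIDIA's verbatim code). An asm
volatile block is opaque to LLVM, so no load, store, or shuffle can be combined
with or reordered across it, which is exactly what the builtin-based
definitions above avoid.

__device__ int __shfl_asm_sketch(int __in, int __lane) {
  int __out;
  // Opaque to the optimizer: clang sees only a volatile asm blob.
  asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;"
               : "=r"(__out)
               : "r"(__in), "r"(__lane));
  return __out;
}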