forked from OSchip/llvm-project
[libomptarget] Refactor shfl_down_sync macro to inline function
Summary: [libomptarget] Refactor shfl_down_sync macro to inline function. See also the abandoned D66846, which was split into this diff and others. Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers. Subscribers: openmp-commits. Tags: #openmp. Differential Revision: https://reviews.llvm.org/D66853 — llvm-svn: 370146
This commit is contained in:
parent
4a44569cbd
commit
327aa81123
|
@ -51,13 +51,9 @@
|
|||
#ifndef CUDA_VERSION
|
||||
#error CUDA_VERSION macro is undefined, something wrong with cuda.
|
||||
#elif CUDA_VERSION >= 9000
|
||||
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
|
||||
__shfl_down_sync((mask), (var), (delta), (width))
|
||||
#define __ACTIVEMASK() __activemask()
|
||||
#define __SYNCWARP(Mask) __syncwarp(Mask)
|
||||
#else
|
||||
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
|
||||
__shfl_down((var), (delta), (width))
|
||||
#define __ACTIVEMASK() __ballot(1)
|
||||
// In CUDA < 9.0 there is no need to sync threads within a warp.
|
||||
#define __SYNCWARP(Mask)
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include <stdio.h>
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
#include "target_impl.h"
|
||||
|
||||
EXTERN
|
||||
void __kmpc_nvptx_end_reduce(int32_t global_tid) {
  // Intentionally empty: this entry point performs no work.
}
|
||||
|
@ -23,14 +24,14 @@ EXTERN
|
|||
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {
  // Intentionally empty: this entry point performs no work.
}
|
||||
|
||||
// Shuffles a 32-bit value down the warp: each lane receives `val` from the
// lane `delta` positions above it, within segments of `size` lanes.
// The full-warp mask 0xFFFFFFFF is always used.
//
// Only the refactored call is kept; the previous macro-based return that
// preceded it made this call unreachable.
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __kmpc_impl_shfl_down_sync(0xFFFFFFFF, val, delta, size);
}
|
||||
|
||||
// Shuffles a 64-bit value down the warp by `delta` lanes, within segments of
// `size` lanes. The shuffle helper operates on 32-bit values, so the input is
// split into two 32-bit words, each word is shuffled once with the full-warp
// mask 0xFFFFFFFF, and the words are then recombined.
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  int lo, hi;
  // Unpack the 64-bit register into two 32-bit registers.
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
  // Shuffle each word exactly once. Shuffling a word a second time would
  // move the data by a further `delta` lanes and return the wrong lane's
  // value.
  hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
  lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
  // Repack the two shuffled words into a single 64-bit result.
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}
|
||||
|
|
|
@ -43,6 +43,7 @@ INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
|
|||
#endif
|
||||
|
||||
// In CUDA 9.0 and later, the *_sync() versions take an extra 'mask' argument.
|
||||
|
||||
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
||||
int32_t SrcLane) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
|
@ -50,6 +51,15 @@ INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
|||
#else
|
||||
return __shfl(Var, SrcLane);
|
||||
#endif // CUDA_VERSION
|
||||
|
||||
// Version-independent wrapper around the warp shuffle-down intrinsic.
// Returns the value of Var obtained via a shuffle-down by Delta lanes with
// the given Width. Mask is forwarded only when the toolkit provides the
// *_sync intrinsic; the legacy intrinsic has no mask parameter.
INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width) {
#if CUDA_VERSION < 9000
  // Older toolkits: mask-less intrinsic, Mask is unused.
  return __shfl_down(Var, Delta, Width);
#else
  // CUDA 9.0 and later: explicit participant mask.
  return __shfl_down_sync(Mask, Var, Delta, Width);
#endif // CUDA_VERSION
}
|
||||
|
||||
// Thin wrapper that forwards Mask to the __SYNCWARP macro.
INLINE void __kmpc_impl_syncwarp(int32_t Mask) {
  __SYNCWARP(Mask);
}
|
||||
|
|
Loading…
Reference in New Issue