forked from OSchip/llvm-project
[libomptarget] Refactor shfl_sync macro to inline function
Summary: [libomptarget] Refactor shfl_sync macro to inline function. See also abandoned D66846, split into this diff and others. Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers. Subscribers: openmp-commits. Tags: #openmp. Differential Revision: https://reviews.llvm.org/D66852 (llvm-svn: 370144)
This commit is contained in:
parent
60a99f4964
commit
b9b712df82
|
@ -10,6 +10,7 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "omptarget-nvptx.h"
|
||||
#include "target_impl.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// Warp ID in the CUDA block
|
||||
|
@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
|
|||
}
|
||||
}
|
||||
// Get address from lane 0.
|
||||
((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
|
||||
int *FP = (int *)&FrameP;
|
||||
FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
|
||||
if (sizeof(FrameP) == 8)
|
||||
((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
|
||||
FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
|
||||
|
||||
return FrameP;
|
||||
}
|
||||
|
|
|
@ -383,8 +383,8 @@ public:
|
|||
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
|
||||
int lo, hi;
|
||||
__kmpc_impl_unpack(val, lo, hi);
|
||||
hi = __SHFL_SYNC(active, hi, leader);
|
||||
lo = __SHFL_SYNC(active, lo, leader);
|
||||
hi = __kmpc_impl_shfl_sync(active, hi, leader);
|
||||
lo = __kmpc_impl_shfl_sync(active, lo, leader);
|
||||
return __kmpc_impl_pack(lo, hi);
|
||||
}
|
||||
|
||||
|
|
|
@ -51,13 +51,11 @@
|
|||
#ifndef CUDA_VERSION
|
||||
#error CUDA_VERSION macro is undefined, something wrong with cuda.
|
||||
#elif CUDA_VERSION >= 9000
|
||||
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
|
||||
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
|
||||
__shfl_down_sync((mask), (var), (delta), (width))
|
||||
#define __ACTIVEMASK() __activemask()
|
||||
#define __SYNCWARP(Mask) __syncwarp(Mask)
|
||||
#else
|
||||
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
|
||||
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
|
||||
__shfl_down((var), (delta), (width))
|
||||
#define __ACTIVEMASK() __ballot(1)
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
#include "target_impl.h"
|
||||
|
||||
typedef struct ConvergentSimdJob {
|
||||
omptarget_nvptx_TaskDescr taskDescr;
|
||||
|
@ -64,7 +65,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
|
|||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
|
||||
job->slimForNextSimd = SimdLimit;
|
||||
|
||||
int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
|
||||
int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
|
||||
// reset simdlimit to avoid propagating to successive #simd
|
||||
if (SimdLimitSource > 0 && threadId == sourceThreadId)
|
||||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
|
||||
|
@ -138,7 +139,8 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
|
|||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
|
||||
job->tnumForNextPar = NumThreadsClause;
|
||||
|
||||
int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
|
||||
int32_t NumThreadsSource =
|
||||
__kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
|
||||
// reset numthreads to avoid propagating to successive #parallel
|
||||
if (NumThreadsSource > 0 && threadId == sourceThreadId)
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
|
||||
|
|
|
@ -38,6 +38,20 @@ INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
|
|||
|
||||
INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
|
||||
|
||||
#ifndef CUDA_VERSION
|
||||
#error CUDA_VERSION macro is undefined, something wrong with cuda.
|
||||
#endif
|
||||
|
||||
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
|
||||
// Warp shuffle wrapper: forwards Var and SrcLane to the CUDA shuffle intrinsic
// chosen by CUDA_VERSION and returns its int32_t result.
//   Mask    - lane mask handed to __shfl_sync (used only when CUDA >= 9.0).
//   Var     - 32-bit value passed to the shuffle.
//   SrcLane - source-lane argument of the shuffle; presumably the lane whose
//             Var is returned (confirm against the CUDA documentation).
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
||||
int32_t SrcLane) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_sync(Mask, Var, SrcLane);
|
||||
#else
|
||||
// Pre-9.0 path: __shfl takes no mask argument, so Mask is not used here.
|
||||
return __shfl(Var, SrcLane);
|
||||
#endif // CUDA_VERSION
|
||||
}
|
||||
|
||||
// Thin wrapper that forwards Mask to the __SYNCWARP macro (defined elsewhere).
// NOTE(review): Mask is int32_t here while __kmpc_impl_shfl_sync takes
// __kmpc_impl_lanemask_t; confirm whether the signed type is intentional.
INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue