[libomptarget][nfc] Wrap cuda min() in target_impl

Summary:
[libomptarget][nfc] Wrap cuda min() in target_impl

nvptx forwards to cuda min, amdgcn implements directly.
Sufficient to build parallel.cu for amdgcn, added to CMakeLists.

All call sites are homogeneous except one that passes a uint32_t and an
int32_t. This could be smoothed over by taking two type parameters
and some care over the return type, but overall I think the inline
<uint32_t> calling attention to what was an implicit sign conversion
is cleaner.

Reviewers: ABataev, jdoerfert

Reviewed By: jdoerfert

Subscribers: jvesely, mgorny, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D71580
This commit is contained in:
Jon Chesterfield 2019-12-17 01:30:04 +00:00
parent 7a31678b71
commit 53bcd1e141
5 changed files with 14 additions and 5 deletions

View File

@@ -59,6 +59,7 @@ set(cuda_sources
${devicertl_base_directory}/common/src/critical.cu
${devicertl_base_directory}/common/src/loop.cu
${devicertl_base_directory}/common/src/omptarget.cu
${devicertl_base_directory}/common/src/parallel.cu
${devicertl_base_directory}/common/src/sync.cu
${devicertl_base_directory}/common/src/task.cu)

View File

@@ -109,6 +109,10 @@ INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
// Type-homogeneous minimum: returns the smaller of x and y.
// amdgcn has no device-side min() overload set, so compare directly.
template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
  if (y < x)
    return y;
  return x;
}
// Returns a lanemask identifying the currently active lanes.
// Implemented via the __ballot64 intrinsic with a true predicate, so every
// active lane contributes a set bit. NOTE(review): __ballot64 and
// __kmpc_impl_lanemask_t are declared elsewhere in target_impl — presumably
// a 64-bit mask matching the amdgcn wavefront width; confirm there.
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
return __ballot64(1);
}

View File

@@ -72,7 +72,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
// We cannot have more than the # of convergent threads.
if (SimdLimitSource > 0)
*NumLanes = min(ConvergentSize, SimdLimitSource);
*NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
else
*NumLanes = ConvergentSize;
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
@@ -149,7 +149,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
// We cannot have more than the # of convergent threads.
uint16_t NumThreads;
if (NumThreadsSource > 0)
NumThreads = min(ConvergentSize, NumThreadsSource);
NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
else
NumThreads = ConvergentSize;
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",

View File

@@ -480,14 +480,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// by returning 1 in the thread holding the reduction result.
// Check if this is the very last team.
unsigned NumRecs = min(NumTeams, num_of_records);
unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
if (ChunkTeamCount == NumTeams - Bound - 1) {
//
// Last team processing.
//
if (ThreadId >= NumRecs)
return 0;
NumThreads = roundToWarpsize(min(NumThreads, NumRecs));
NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
if (ThreadId >= NumThreads)
return 0;
@@ -502,7 +502,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// When we have more than [warpsize] number of threads
// a block reduction is performed here.
uint32_t ActiveThreads = min(NumRecs, NumThreads);
uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp

View File

@@ -104,6 +104,10 @@ INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
// Type-homogeneous minimum for the nvptx target: forwards to CUDA's
// overloaded device min(). Both arguments must be the same type T, which
// (per the commit message above) deliberately surfaces any implicit sign
// conversion at the call site. Kept as a forwarding call rather than a
// hand-rolled compare to preserve CUDA's min() semantics for all T.
template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
return min(x, y);
}
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#endif