forked from OSchip/llvm-project
[OpenMP][DeviceRTL] Extract shuffle idiom and port it to declare variant
The shuffle idiom is implemented differently in our supported targets. To reduce the "target_impl" file we now move the shuffle idiom into its own self-contained header that provides the implementation for AMDGPU and NVPTX. A fallback can be added later on. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D95752
This commit is contained in:
parent
b2642456ab
commit
66ba494b49
|
@ -68,6 +68,7 @@ set(cuda_sources
|
|||
${devicertl_base_directory}/common/src/parallel.cu
|
||||
${devicertl_base_directory}/common/src/reduction.cu
|
||||
${devicertl_base_directory}/common/src/support.cu
|
||||
${devicertl_base_directory}/common/src/shuffle.cpp
|
||||
${devicertl_base_directory}/common/src/sync.cu
|
||||
${devicertl_base_directory}/common/src/task.cu)
|
||||
|
||||
|
@ -112,6 +113,7 @@ macro(add_cuda_bc_library)
|
|||
-O${optimization_level}
|
||||
${CUDA_DEBUG}
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR}/src
|
||||
-I${devicertl_base_directory}/common/include
|
||||
-I${devicertl_base_directory})
|
||||
|
||||
set(bc1_files)
|
||||
|
|
|
@ -52,22 +52,6 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
|||
return __builtin_amdgcn_read_exec();
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
|
||||
int32_t srcLane) {
|
||||
int width = WARPSIZE;
|
||||
int self = GetLaneId();
|
||||
int index = srcLane + (self & ~(width - 1));
|
||||
return __builtin_amdgcn_ds_bpermute(index << 2, var);
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
|
||||
uint32_t laneDelta, int32_t width) {
|
||||
int self = GetLaneId();
|
||||
int index = self + laneDelta;
|
||||
index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
|
||||
return __builtin_amdgcn_ds_bpermute(index << 2, var);
|
||||
}
|
||||
|
||||
uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
|
||||
#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)
|
||||
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Shuffle function implementations for all supported targets.
|
||||
//
|
||||
// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
|
||||
#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#pragma omp declare target
|
||||
|
||||
/// External shuffle API
|
||||
///
|
||||
///{
|
||||
|
||||
extern "C" {
|
||||
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
|
||||
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
||||
}
|
||||
|
||||
///}
|
||||
|
||||
/// Forward declarations
|
||||
///
|
||||
///{
|
||||
unsigned GetLaneId();
|
||||
unsigned GetWarpSize();
|
||||
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
|
||||
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
|
||||
///}
|
||||
|
||||
/// Fallback implementations of the shuffle sync idiom.
|
||||
///
|
||||
///{
|
||||
|
||||
/// Base (non-variant) version of the indexed shuffle.
///
/// No generic implementation exists; this definition only fails loudly when
/// it is reached.
///
/// \param Mask    Lane mask (unused here).
/// \param Var     Value to exchange (unused here).
/// \param SrcLane Lane to read from (unused here).
/// \returns Never returns.
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                     int32_t SrcLane) {
  assert(false &&
         "Fallback version of __kmpc_impl_shfl_sync is not available!");
  // When assertions are compiled out the check above disappears. Trap
  // explicitly so control never falls off the end of a value-returning
  // function, which would be undefined behaviour.
  __builtin_trap();
}
|
||||
|
||||
/// Base (non-variant) version of the shuffle-down operation.
///
/// No generic implementation exists; this definition only fails loudly when
/// it is reached.
///
/// \param Mask  Lane mask (unused here).
/// \param Var   Value to exchange (unused here).
/// \param Delta Lane offset (unused here).
/// \param Width Logical group width (unused here).
/// \returns Never returns.
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                          uint32_t Delta, int32_t Width) {
  assert(false &&
         "Fallback version of __kmpc_impl_shfl_down_sync is not available!");
  // When assertions are compiled out the check above disappears. Trap
  // explicitly so control never falls off the end of a value-returning
  // function, which would be undefined behaviour.
  __builtin_trap();
}
|
||||
|
||||
///}
|
||||
|
||||
/// AMDGCN implementations of the shuffle sync idiom.
|
||||
///
|
||||
///{
|
||||
#pragma omp begin declare variant match(device = {arch(amdgcn)})
|
||||
|
||||
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
|
||||
int32_t SrcLane) {
|
||||
int Width = GetWarpSize();
|
||||
int Self = GetLaneId();
|
||||
int Index = SrcLane + (Self & ~(Width - 1));
|
||||
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
|
||||
}
|
||||
|
||||
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
|
||||
uint32_t LaneDelta, int32_t Width) {
|
||||
int Self = GetLaneId();
|
||||
int Index = Self + LaneDelta;
|
||||
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
|
||||
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
|
||||
}
|
||||
|
||||
#pragma omp end declare variant
|
||||
///}
|
||||
|
||||
/// NVPTX implementations of the shuffle and shuffle sync idiom.
|
||||
///
|
||||
///{
|
||||
#pragma omp begin declare variant match( \
|
||||
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
|
||||
|
||||
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
|
||||
int32_t SrcLane) {
|
||||
return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
|
||||
}
|
||||
|
||||
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
|
||||
uint32_t Delta, int32_t Width) {
|
||||
int32_t T = ((GetWarpSize() - Width) << 8) | 0x1f;
|
||||
return __nvvm_shfl_down_i32(Var, Delta, T);
|
||||
}
|
||||
|
||||
#pragma omp end declare variant
|
||||
///}
|
||||
|
||||
#pragma omp end declare target
|
||||
|
||||
#endif
|
|
@ -12,6 +12,7 @@
|
|||
#pragma omp declare target
|
||||
|
||||
#include "common/omptarget.h"
|
||||
#include "target/shuffle.h"
|
||||
#include "target_impl.h"
|
||||
|
||||
// Return true if this is the master thread.
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#pragma omp declare target
|
||||
|
||||
#include "common/omptarget.h"
|
||||
#include "target/shuffle.h"
|
||||
#include "target_impl.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#pragma omp declare target
|
||||
|
||||
#include "common/omptarget.h"
|
||||
#include "target/shuffle.h"
|
||||
#include "target_impl.h"
|
||||
|
||||
EXTERN
|
||||
|
@ -20,18 +21,6 @@ void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
|
|||
EXTERN
|
||||
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
|
||||
|
||||
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
|
||||
return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
|
||||
}
|
||||
|
||||
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
|
||||
uint32_t lo, hi;
|
||||
__kmpc_impl_unpack(val, lo, hi);
|
||||
hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
|
||||
lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
|
||||
return __kmpc_impl_pack(lo, hi);
|
||||
}
|
||||
|
||||
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct) {
|
||||
for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "target/shuffle.h"
|
||||
|
||||
#pragma omp declare target
|
||||
|
||||
static constexpr uint64_t AllLanes = -1;
|
||||
|
||||
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
|
||||
return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size);
|
||||
}
|
||||
|
||||
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
|
||||
uint32_t lo, hi;
|
||||
__kmpc_impl_unpack(val, lo, hi);
|
||||
hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size);
|
||||
lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size);
|
||||
return __kmpc_impl_pack(lo, hi);
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
|
@ -128,6 +128,7 @@ set(cuda_src_files
|
|||
${devicertl_common_directory}/src/support.cu
|
||||
${devicertl_common_directory}/src/sync.cu
|
||||
${devicertl_common_directory}/src/task.cu
|
||||
${devicertl_common_directory}/src/shuffle.cpp
|
||||
src/target_impl.cu
|
||||
)
|
||||
|
||||
|
@ -140,6 +141,7 @@ set(bc_flags -S -x c++ -O1 -std=c++14
|
|||
-Xclang -target-feature -Xclang +ptx61
|
||||
-D__CUDACC__
|
||||
-I${devicertl_base_directory}
|
||||
-I${devicertl_common_directory}/include
|
||||
-I${devicertl_nvptx_directory}/src)
|
||||
|
||||
if(${LIBOMPTARGET_NVPTX_DEBUG})
|
||||
|
|
|
@ -59,18 +59,6 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
|||
return Mask;
|
||||
}
|
||||
|
||||
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
||||
int32_t SrcLane) {
|
||||
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
|
||||
}
|
||||
|
||||
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
|
||||
int32_t Var, uint32_t Delta,
|
||||
int32_t Width) {
|
||||
int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
|
||||
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
|
||||
}
|
||||
|
||||
DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
|
||||
|
||||
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
||||
|
|
|
@ -57,12 +57,6 @@ EXTERN uint32_t __kmpc_impl_smid();
|
|||
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
|
||||
|
||||
EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
||||
int32_t SrcLane);
|
||||
EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
|
||||
int32_t Var, uint32_t Delta,
|
||||
int32_t Width);
|
||||
|
||||
EXTERN void __kmpc_impl_syncthreads();
|
||||
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
|
||||
|
||||
|
|
Loading…
Reference in New Issue