[OpenMP][DeviceRTL] Extract shuffle idiom and port it to declare variant

The shuffle idiom is implemented differently on each of our supported targets. To reduce the size of the "target_impl" files, we now move the shuffle idiom into its own self-contained header that provides the implementations for AMDGPU and NVPTX. A fallback can be added later on. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D95752
2021-01-30 14:46:58 -06:00 · 2021-01-30 14:46:58 -06:00 · 66ba494b49
parent b2642456ab
commit 66ba494b49
10 changed files with 143 additions and 46 deletions
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@ -68,6 +68,7 @@ set(cuda_sources
  ${devicertl_base_directory}/common/src/parallel.cu
  ${devicertl_base_directory}/common/src/reduction.cu
  ${devicertl_base_directory}/common/src/support.cu
+  ${devicertl_base_directory}/common/src/shuffle.cpp
  ${devicertl_base_directory}/common/src/sync.cu
  ${devicertl_base_directory}/common/src/task.cu)

@ -112,6 +113,7 @@ macro(add_cuda_bc_library)
    -O${optimization_level}
    ${CUDA_DEBUG}
    -I${CMAKE_CURRENT_SOURCE_DIR}/src
+    -I${devicertl_base_directory}/common/include
    -I${devicertl_base_directory})

  set(bc1_files)
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@ -52,22 +52,6 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
 }

-EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
-                                     int32_t srcLane) {
-  int width = WARPSIZE;
-  int self = GetLaneId();
-  int index = srcLane + (self & ~(width - 1));
-  return __builtin_amdgcn_ds_bpermute(index << 2, var);
-}
-
-EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
-                                          uint32_t laneDelta, int32_t width) {
-  int self = GetLaneId();
-  int index = self + laneDelta;
-  index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
-  return __builtin_amdgcn_ds_bpermute(index << 2, var);
-}
-
 uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
 #pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)

--- a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
+++ b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
@ -0,0 +1,107 @@
+//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shuffle function implementations for all supported targets.
+//
+// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
+#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
+
+#include <assert.h>
+#include <inttypes.h>
+
+#pragma omp declare target
+
+/// External shuffle API
+///
+///{
+
+extern "C" {
+int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
+int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
+}
+
+///}
+
+/// Forward declarations
+///
+///{
+unsigned GetLaneId();
+unsigned GetWarpSize();
+void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+///}
+
+/// Fallback implementations of the shuffle sync idiom.
+///
+///{
+
+/// Base (non-variant) version of the indexed shuffle. No generic
+/// implementation exists, so reaching this function is always an error.
+///
+/// \param Mask    Lane mask (unused here).
+/// \param Var     Value that would be exchanged (unused here).
+/// \param SrcLane Lane that would be read from (unused here).
+/// \returns Never returns; the call asserts and then traps.
+inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  assert(false &&
+         "Fallback version of __kmpc_impl_shfl_sync is not available!");
+  // assert() expands to nothing under NDEBUG. Trap explicitly so control can
+  // never flow off the end of this non-void function (undefined behavior).
+  __builtin_trap();
+}
+
+/// Base (non-variant) version of the shuffle-down operation. No generic
+/// implementation exists, so reaching this function is always an error.
+///
+/// \param Mask  Lane mask (unused here).
+/// \param Var   Value that would be exchanged (unused here).
+/// \param Delta Lane offset that would be applied (unused here).
+/// \param Width Segment width that would be used (unused here).
+/// \returns Never returns; the call asserts and then traps.
+inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  assert(false &&
+         "Fallback version of __kmpc_impl_shfl_down_sync is not available!");
+  // assert() expands to nothing under NDEBUG. Trap explicitly so control can
+  // never flow off the end of this non-void function (undefined behavior).
+  __builtin_trap();
+}
+
+///}
+
+/// AMDGCN implementations of the shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+/// AMDGCN indexed shuffle: fetch \p Var from lane \p SrcLane, where
+/// \p SrcLane is taken relative to the start of the calling lane's warp.
+///
+/// \param Mask    Lane mask; not consulted by this implementation.
+/// \param Var     Value contributed by the calling lane.
+/// \param SrcLane Lane offset, relative to the warp base, to read from.
+/// \returns The result of the ds_bpermute builtin for the computed lane.
+inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  const int WarpSize = GetWarpSize();
+  const int LaneId = GetLaneId();
+  // Clearing the low bits of the lane id yields the first lane of the
+  // WarpSize-aligned group the caller belongs to.
+  const int WarpBase = LaneId & ~(WarpSize - 1);
+  const int SourceLane = SrcLane + WarpBase;
+  // The builtin is handed the lane index multiplied by four.
+  return __builtin_amdgcn_ds_bpermute(SourceLane << 2, Var);
+}
+
+/// AMDGCN shuffle-down: fetch \p Var from the lane \p LaneDelta positions
+/// above the caller. If that lane would fall outside the caller's
+/// \p Width-sized segment, the caller reads its own value instead.
+///
+/// \param Mask      Lane mask; not consulted by this implementation.
+/// \param Var       Value contributed by the calling lane.
+/// \param LaneDelta Distance (in lanes) to the source lane.
+/// \param Width     Segment size; the masking assumes a power of two.
+/// \returns The result of the ds_bpermute builtin for the selected lane.
+inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
+                                          uint32_t LaneDelta, int32_t Width) {
+  const int LaneId = GetLaneId();
+  // Position of the calling lane inside its Width-sized segment.
+  const int SegmentPos = LaneId & (Width - 1);
+  // True when the requested source lane lies beyond the segment's end.
+  const bool LeavesSegment = (int)(LaneDelta + SegmentPos) >= Width;
+  const int SourceLane = LeavesSegment ? LaneId : (int)(LaneId + LaneDelta);
+  // The builtin is handed the lane index multiplied by four.
+  return __builtin_amdgcn_ds_bpermute(SourceLane << 2, Var);
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX implementations of the shuffle and shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+/// NVPTX indexed shuffle: fetch \p Var from lane \p SrcLane of the warp.
+///
+/// Uses the synchronizing shuffle builtin so that the lanes named in
+/// \p Mask converge before exchanging data; the mask-less legacy shuffle
+/// relies on implicit warp synchrony, which does not hold under independent
+/// thread scheduling (Volta and newer).
+///
+/// \param Mask    Participating lanes. Only the low 32 bits are meaningful
+///                on NVPTX; the unified uint64_t mask is truncated.
+/// \param Var     Value contributed by the calling lane.
+/// \param SrcLane Lane to read from.
+/// \returns The value of \p Var held by lane \p SrcLane.
+inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  return __nvvm_shfl_sync_idx_i32(static_cast<uint32_t>(Mask), Var, SrcLane,
+                                  0x1f);
+}
+
+/// NVPTX shuffle-down: fetch \p Var from the lane \p Delta positions above
+/// the caller within a \p Width-sized segment.
+///
+/// Uses the synchronizing shuffle builtin so that the lanes named in
+/// \p Mask converge before exchanging data; the mask-less legacy shuffle
+/// relies on implicit warp synchrony, which does not hold under independent
+/// thread scheduling (Volta and newer).
+///
+/// \param Mask  Participating lanes. Only the low 32 bits are meaningful on
+///              NVPTX; the unified uint64_t mask is truncated.
+/// \param Var   Value contributed by the calling lane.
+/// \param Delta Distance (in lanes) to the source lane.
+/// \param Width Segment size used to build the shuffle control word.
+/// \returns The value produced by the shuffle-down builtin.
+inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  // Control word: (warp size - Width) goes into bits 8 and up, 0x1f into the
+  // low five bits.
+  int32_t T = ((GetWarpSize() - Width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_down_i32(static_cast<uint32_t>(Mask), Var, Delta, T);
+}
+
+#pragma omp end declare variant
+///}
+
+#pragma omp end declare target
+
+#endif
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@ -12,6 +12,7 @@
 #pragma omp declare target

 #include "common/omptarget.h"
+#include "target/shuffle.h"
 #include "target_impl.h"

 // Return true if this is the master thread.
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@ -14,6 +14,7 @@
 #pragma omp declare target

 #include "common/omptarget.h"
+#include "target/shuffle.h"
 #include "target_impl.h"

 ////////////////////////////////////////////////////////////////////////////////
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@ -12,6 +12,7 @@
 #pragma omp declare target

 #include "common/omptarget.h"
+#include "target/shuffle.h"
 #include "target_impl.h"

 EXTERN
@ -20,18 +21,6 @@ void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
 EXTERN
 void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

-EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
-  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
-}
-
-EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
-  uint32_t lo, hi;
-  __kmpc_impl_unpack(val, lo, hi);
-  hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
-  lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
-  return __kmpc_impl_pack(lo, hi);
-}
-
 INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
--- a/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp
+++ b/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp
@ -0,0 +1,29 @@
+//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "target/shuffle.h"
+
+#pragma omp declare target
+
+static constexpr uint64_t AllLanes = -1;
+
+/// External 32-bit shuffle entry point. Forwards to the target-specific
+/// shuffle-down implementation with every lane selected in the mask.
+///
+/// \param val   Value contributed by the calling lane.
+/// \param delta Lane offset passed on as the shuffle distance.
+/// \param size  Segment width passed on to the shuffle.
+/// \returns Whatever the target's shuffle-down returns for \p val.
+int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
+  // Spell out the conversions that the callee's parameter types imply.
+  const uint32_t Distance = static_cast<uint32_t>(delta);
+  const int32_t SegmentWidth = static_cast<int32_t>(size);
+  return __kmpc_impl_shfl_down_sync(AllLanes, val, Distance, SegmentWidth);
+}
+
+/// External 64-bit shuffle entry point. The value is split into two 32-bit
+/// halves, each half is shuffled down independently with every lane selected
+/// in the mask, and the results are packed back together.
+///
+/// \param val   Value contributed by the calling lane.
+/// \param delta Lane offset passed on as the shuffle distance.
+/// \param size  Segment width passed on to the shuffle.
+/// \returns The 64-bit value rebuilt from the two shuffled halves.
+int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
+  uint32_t LowHalf, HighHalf;
+  __kmpc_impl_unpack(val, LowHalf, HighHalf);
+  // Shuffle the high half first, then the low half.
+  HighHalf = __kmpc_impl_shfl_down_sync(AllLanes, HighHalf, delta, size);
+  LowHalf = __kmpc_impl_shfl_down_sync(AllLanes, LowHalf, delta, size);
+  return __kmpc_impl_pack(LowHalf, HighHalf);
+}
+
+#pragma omp end declare target
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@ -128,6 +128,7 @@ set(cuda_src_files
  ${devicertl_common_directory}/src/support.cu
  ${devicertl_common_directory}/src/sync.cu
  ${devicertl_common_directory}/src/task.cu
+  ${devicertl_common_directory}/src/shuffle.cpp
  src/target_impl.cu
 )

@ -140,6 +141,7 @@ set(bc_flags -S -x c++ -O1 -std=c++14
             -Xclang -target-feature -Xclang +ptx61
             -D__CUDACC__
             -I${devicertl_base_directory}
+             -I${devicertl_common_directory}/include
             -I${devicertl_nvptx_directory}/src)

 if(${LIBOMPTARGET_NVPTX_DEBUG})
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@ -59,18 +59,6 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
  return Mask;
 }

-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-}
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width) {
-  int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }

 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@ -57,12 +57,6 @@ EXTERN uint32_t __kmpc_impl_smid();

 EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();

-EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane);
-EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width);
-
 EXTERN void __kmpc_impl_syncthreads();
 EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);