[libomptarget] Implement target_impl for amdgcn

Summary: [libomptarget] Implement target_impl for amdgcn Smallest atomic addition for a new target. Implements enough of the amdgcn specific code that some of the source files under nvptx/src could be compiled, without modification, to run on amdgcn. This foreshadows a work in progress patch to move said source out of nvptx/src. Patch based on fork at https://github.com/ROCm-Developer-Tools/llvm-project Reviewers: ABataev, jdoerfert, grokos, ronlieb Subscribers: jvesely, jfb, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D69718
2019-11-01 14:58:39 +00:00 · 2019-11-01 14:58:39 +00:00 · 94c59ea8dd
parent 56c22931bd
commit 94c59ea8dd
3 changed files with 151 additions and 0 deletions
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
@ -0,0 +1,17 @@
+//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _AMDGCN_INTERFACE_H_
+#define _AMDGCN_INTERFACE_H_
+
+#include <stdint.h>
+
+#define EXTERN extern "C" __attribute__((device))
+typedef uint64_t __kmpc_impl_lanemask_t;
+
+#endif
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@ -0,0 +1,131 @@
+//===------------ target_impl.h - AMDGCN OpenMP GPU options ------ CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definitions of target specific functions
+//
+//===----------------------------------------------------------------------===//
+#ifndef _TARGET_IMPL_H_
+#define _TARGET_IMPL_H_
+
+#ifndef __AMDGCN__
+#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
+#endif
+
+#include <stdint.h>
+#include "amdgcn_interface.h"
+
+#define DEVICE __device__
+#define INLINE __inline__ DEVICE
+#define NOINLINE __noinline__ DEVICE
+
+////////////////////////////////////////////////////////////////////////////////
+// Kernel options
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// The following def must match the absolute limit hardwired in the host RTL
+// max number of threads per team
+#define MAX_THREADS_PER_TEAM 1024
+
+#define WARPSIZE 64
+
+
+// The named barrier for active parallel threads of a team in an L1 parallel
+// region to synchronize with each other.
+#define L1_BARRIER (1)
+
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
+// Maximum number of omp state objects per SM allocated statically in global
+// memory.
+#define OMP_STATE_COUNT 32
+#define MAX_SM 64
+
+
+#define OMP_ACTIVE_PARALLEL_LEVEL 128
+
+// Data sharing related quantities, need to match what is used in the compiler.
+enum DATA_SHARING_SIZES {
+  // The maximum number of workers in a kernel.
+  DS_Max_Worker_Threads = 960,
+  // The size reserved for data in a shared memory slot.
+  DS_Slot_Size = 256,
+  // The slot size that should be reserved for a working warp.
+  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  // The maximum number of warps in use
+  DS_Max_Warp_Number = 16,
+};
+
+// warp vote function
+EXTERN uint64_t __ballot64(int predicate);
+// initialized with a 64-bit mask with bits set in positions less than the
+// thread's lane number in the warp
+EXTERN uint64_t __lanemask_lt();
+// initialized with a 64-bit mask with bits set in positions greater than the
+// thread's lane number in the warp
+EXTERN uint64_t __lanemask_gt();
+
+EXTERN void llvm_amdgcn_s_barrier();
+
+// CU id
+EXTERN unsigned __smid();
+
+INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
+  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
+  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+}
+
+INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+  return (((uint64_t)hi) << 32) | (uint64_t)lo;
+}
+
+static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
+    UINT64_C(0xffffffffffffffff);
+
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
+  return __lanemask_lt();
+}
+
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+  return __lanemask_gt();
+}
+
+INLINE uint32_t __kmpc_impl_smid() {
+  return __smid();
+}
+
+INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __ffsll(x); }
+
+INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __popcll(x); }
+
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+  return __ballot64(1);
+}
+
+INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
+                                     int32_t SrcLane) {
+  return __shfl(Var, SrcLane, WARPSIZE);
+}
+
+INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  return __shfl_down(Var, Delta, Width);
+}
+
+INLINE void __kmpc_impl_syncthreads() { llvm_amdgcn_s_barrier(); }
+
+INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
+  // we have protected the master warp from releasing from its barrier
+  // due to a full workgroup barrier in the middle of a work function.
+  // So it is ok to issue a full workgroup barrier here.
+  __builtin_amdgcn_s_barrier();
+}
+
+#endif
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@ -19,6 +19,9 @@
 #include <stddef.h>
 #include <stdint.h>

+#ifdef __AMDGCN__
+#include "amdgcn/src/amdgcn_interface.h"
+#endif
 #ifdef __CUDACC__
 #include "nvptx/src/nvptx_interface.h"
 #endif