forked from OSchip/llvm-project
[libomptarget] Implement target_impl for amdgcn
Summary: [libomptarget] Implement target_impl for amdgcn Smallest atomic addition for a new target. Implements enough of the amdgcn specific code that some of the source files under nvptx/src could be compiled, without modification, to run on amdgcn. This foreshadows a work in progress patch to move said source out of nvptx/src. Patch based on fork at https://github.com/ROCm-Developer-Tools/llvm-project Reviewers: ABataev, jdoerfert, grokos, ronlieb Subscribers: jvesely, jfb, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D69718
This commit is contained in:
parent
56c22931bd
commit
94c59ea8dd
|
@ -0,0 +1,17 @@
|
|||
//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _AMDGCN_INTERFACE_H_
|
||||
#define _AMDGCN_INTERFACE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define EXTERN extern "C" __attribute__((device))
|
||||
typedef uint64_t __kmpc_impl_lanemask_t;
|
||||
|
||||
#endif
|
|
@ -0,0 +1,131 @@
|
|||
//===------------ target_impl.h - AMDGCN OpenMP GPU options ------ CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Definitions of target specific functions
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef _TARGET_IMPL_H_
|
||||
#define _TARGET_IMPL_H_
|
||||
|
||||
#ifndef __AMDGCN__
|
||||
#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "amdgcn_interface.h"
|
||||
|
||||
#define DEVICE __device__
|
||||
#define INLINE __inline__ DEVICE
|
||||
#define NOINLINE __noinline__ DEVICE
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Kernel options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// The following def must match the absolute limit hardwired in the host RTL
|
||||
// max number of threads per team
|
||||
#define MAX_THREADS_PER_TEAM 1024
|
||||
|
||||
#define WARPSIZE 64
|
||||
|
||||
|
||||
// The named barrier for active parallel threads of a team in an L1 parallel
|
||||
// region to synchronize with each other.
|
||||
#define L1_BARRIER (1)
|
||||
|
||||
// Maximum number of preallocated arguments to an outlined parallel/simd function.
|
||||
// Anything more requires dynamic memory allocation.
|
||||
#define MAX_SHARED_ARGS 20
|
||||
|
||||
// Maximum number of omp state objects per SM allocated statically in global
|
||||
// memory.
|
||||
#define OMP_STATE_COUNT 32
|
||||
#define MAX_SM 64
|
||||
|
||||
|
||||
#define OMP_ACTIVE_PARALLEL_LEVEL 128
|
||||
|
||||
// Data sharing related quantities, need to match what is used in the compiler.
|
||||
enum DATA_SHARING_SIZES {
|
||||
// The maximum number of workers in a kernel.
|
||||
DS_Max_Worker_Threads = 960,
|
||||
// The size reserved for data in a shared memory slot.
|
||||
DS_Slot_Size = 256,
|
||||
// The slot size that should be reserved for a working warp.
|
||||
DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
|
||||
// The maximum number of warps in use
|
||||
DS_Max_Warp_Number = 16,
|
||||
};
|
||||
|
||||
// warp vote function
|
||||
EXTERN uint64_t __ballot64(int predicate);
|
||||
// initialized with a 64-bit mask with bits set in positions less than the
|
||||
// thread's lane number in the warp
|
||||
EXTERN uint64_t __lanemask_lt();
|
||||
// initialized with a 64-bit mask with bits set in positions greater than the
|
||||
// thread's lane number in the warp
|
||||
EXTERN uint64_t __lanemask_gt();
|
||||
|
||||
EXTERN void llvm_amdgcn_s_barrier();
|
||||
|
||||
// CU id
|
||||
EXTERN unsigned __smid();
|
||||
|
||||
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
||||
lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
|
||||
hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
|
||||
}
|
||||
|
||||
INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
||||
return (((uint64_t)hi) << 32) | (uint64_t)lo;
|
||||
}
|
||||
|
||||
static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
|
||||
UINT64_C(0xffffffffffffffff);
|
||||
|
||||
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
|
||||
return __lanemask_lt();
|
||||
}
|
||||
|
||||
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
|
||||
return __lanemask_gt();
|
||||
}
|
||||
|
||||
INLINE uint32_t __kmpc_impl_smid() {
|
||||
return __smid();
|
||||
}
|
||||
|
||||
INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __ffsll(x); }
|
||||
|
||||
INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __popcll(x); }
|
||||
|
||||
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
||||
return __ballot64(1);
|
||||
}
|
||||
|
||||
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
|
||||
int32_t SrcLane) {
|
||||
return __shfl(Var, SrcLane, WARPSIZE);
|
||||
}
|
||||
|
||||
INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
|
||||
uint32_t Delta, int32_t Width) {
|
||||
return __shfl_down(Var, Delta, Width);
|
||||
}
|
||||
|
||||
INLINE void __kmpc_impl_syncthreads() { llvm_amdgcn_s_barrier(); }
|
||||
|
||||
INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
|
||||
// we have protected the master warp from releasing from its barrier
|
||||
// due to a full workgroup barrier in the middle of a work function.
|
||||
// So it is ok to issue a full workgroup barrier here.
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
|
||||
#endif
|
|
@ -19,6 +19,9 @@
|
|||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __AMDGCN__
|
||||
#include "amdgcn/src/amdgcn_interface.h"
|
||||
#endif
|
||||
#ifdef __CUDACC__
|
||||
#include "nvptx/src/nvptx_interface.h"
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue