forked from OSchip/llvm-project
[libomptarget] Fix devicertl build
[libomptarget] Fix devicertl build. The target-specific functions in target_interface are declared extern "C", but most of the nvptx implementations were defined with C++ name mangling. That happened to work as a quirk of the DEVICE macro expanding to nothing, except for shuffle.h, which only forward-declared the functions with C++ linkage. This change also implements GetWarpSize, as used by shuffle, and includes target_interface in the nvptx target_impl.cu to help catch future divergence between interface and implementation. Reviewed By: jdoerfert. Differential Revision: https://reviews.llvm.org/D98651
This commit is contained in:
parent
9c486eb348
commit
bcb3f0f867
|
@ -130,6 +130,7 @@ EXTERN int GetNumberOfThreadsInBlock() {
|
|||
}
|
||||
|
||||
EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
||||
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
|
||||
EXTERN unsigned GetLaneId() {
|
||||
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
|
||||
}
|
||||
|
|
|
@ -33,10 +33,12 @@ int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
|||
/// Forward declarations
|
||||
///
|
||||
///{
|
||||
extern "C" {
|
||||
unsigned GetLaneId();
|
||||
unsigned GetWarpSize();
|
||||
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
|
||||
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
|
||||
}
|
||||
///}
|
||||
|
||||
/// Fallback implementations of the shuffle sync idiom.
|
||||
|
|
|
@ -13,64 +13,65 @@
|
|||
|
||||
#include "common/debug.h"
|
||||
#include "target_impl.h"
|
||||
#include "target_interface.h"
|
||||
|
||||
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
||||
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
||||
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
||||
}
|
||||
|
||||
DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
||||
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
||||
uint64_t val;
|
||||
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
||||
return val;
|
||||
}
|
||||
|
||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
|
||||
__kmpc_impl_lanemask_t res;
|
||||
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
|
||||
return res;
|
||||
}
|
||||
|
||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
|
||||
__kmpc_impl_lanemask_t res;
|
||||
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
|
||||
return res;
|
||||
}
|
||||
|
||||
DEVICE uint32_t __kmpc_impl_smid() {
|
||||
EXTERN uint32_t __kmpc_impl_smid() {
|
||||
uint32_t id;
|
||||
asm("mov.u32 %0, %%smid;" : "=r"(id));
|
||||
return id;
|
||||
}
|
||||
|
||||
DEVICE double __kmpc_impl_get_wtick() {
|
||||
EXTERN double __kmpc_impl_get_wtick() {
|
||||
// Timer precision is 1ns
|
||||
return ((double)1E-9);
|
||||
}
|
||||
|
||||
DEVICE double __kmpc_impl_get_wtime() {
|
||||
EXTERN double __kmpc_impl_get_wtime() {
|
||||
unsigned long long nsecs;
|
||||
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
|
||||
return (double)nsecs * __kmpc_impl_get_wtick();
|
||||
}
|
||||
|
||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
||||
unsigned int Mask;
|
||||
asm volatile("activemask.b32 %0;" : "=r"(Mask));
|
||||
return Mask;
|
||||
}
|
||||
|
||||
DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
|
||||
EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); }
|
||||
|
||||
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
||||
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
||||
__nvvm_bar_warp_sync(Mask);
|
||||
}
|
||||
|
||||
// NVPTX specific kernel initialization
|
||||
DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
|
||||
EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
|
||||
}
|
||||
|
||||
// Barrier until num_threads arrive.
|
||||
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||
// The named barrier for active parallel threads of a team in an L1 parallel
|
||||
// region to synchronize with each other.
|
||||
int barrier = 1;
|
||||
|
@ -80,19 +81,20 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
|
|||
: "memory");
|
||||
}
|
||||
|
||||
DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
|
||||
DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
|
||||
DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
|
||||
EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
|
||||
EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
|
||||
EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
|
||||
|
||||
// Calls to the NVPTX layer (assuming 1D layout)
|
||||
DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
|
||||
DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
|
||||
DEVICE int GetNumberOfBlocksInKernel() {
|
||||
EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
|
||||
EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
|
||||
EXTERN int GetNumberOfBlocksInKernel() {
|
||||
return __nvvm_read_ptx_sreg_nctaid_x();
|
||||
}
|
||||
DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
|
||||
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
||||
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
|
||||
EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
|
||||
EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
||||
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
|
||||
EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
|
||||
|
||||
// Atomics
|
||||
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
|
||||
|
@ -135,15 +137,15 @@ DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
|
|||
#define UNSET 0u
|
||||
#define SET 1u
|
||||
|
||||
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
|
||||
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
|
||||
__kmpc_impl_unset_lock(lock);
|
||||
}
|
||||
|
||||
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
|
||||
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
|
||||
__kmpc_impl_unset_lock(lock);
|
||||
}
|
||||
|
||||
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
||||
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
||||
// TODO: not sure spinning is a good idea here..
|
||||
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
|
||||
int32_t start = __nvvm_read_ptx_sreg_clock();
|
||||
|
@ -158,15 +160,15 @@ DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
|||
} // wait for 0 to be the read value
|
||||
}
|
||||
|
||||
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
|
||||
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
|
||||
(void)__kmpc_atomic_exchange(lock, UNSET);
|
||||
}
|
||||
|
||||
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
|
||||
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
|
||||
return __kmpc_atomic_add(lock, 0u);
|
||||
}
|
||||
|
||||
DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
|
||||
DEVICE void __kmpc_impl_free(void *x) { free(x); }
|
||||
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
|
||||
EXTERN void __kmpc_impl_free(void *x) { free(x); }
|
||||
|
||||
#pragma omp end declare target
|
||||
|
|
|
@ -21,18 +21,19 @@ EXTERN int GetBlockIdInKernel();
|
|||
EXTERN int GetNumberOfBlocksInKernel();
|
||||
EXTERN int GetNumberOfThreadsInBlock();
|
||||
EXTERN unsigned GetWarpId();
|
||||
EXTERN unsigned GetWarpSize();
|
||||
EXTERN unsigned GetLaneId();
|
||||
|
||||
// Atomics
|
||||
extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
|
||||
extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
|
||||
extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
|
||||
extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
|
||||
extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
|
||||
DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
|
||||
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
|
||||
DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
|
||||
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
|
||||
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
|
||||
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
|
||||
unsigned long long);
|
||||
extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
|
||||
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
|
||||
unsigned long long);
|
||||
|
||||
// Locks
|
||||
|
|
Loading…
Reference in New Issue