[libomptarget] Fix devicertl build

[libomptarget] Fix devicertl build

The target-specific functions in target_interface are declared extern "C", but
most of the nvptx implementations were defined with C++ name mangling. That
worked only as a quirk of the DEVICE macro expanding to nothing, except in
shuffle.h, whose forward declarations of these functions had C++ linkage.

Also implements GetWarpSize, which shuffle uses, and includes target_interface.h
in the nvptx target_impl.cu to help catch future divergence between the
interface and the implementation.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D98651
This commit is contained in:
Jon Chesterfield 2021-03-15 19:50:21 +00:00
parent 9c486eb348
commit bcb3f0f867
4 changed files with 43 additions and 37 deletions

View File

@ -130,6 +130,7 @@ EXTERN int GetNumberOfThreadsInBlock() {
}
// Index of the calling thread's warp inside its block: the in-block thread id
// divided by WARPSIZE.
EXTERN unsigned GetWarpId() {
  const auto ThreadInBlock = GetThreadIdInBlock();
  return ThreadInBlock / WARPSIZE;
}
// Reports the warp width as fixed by the WARPSIZE constant.
EXTERN unsigned GetWarpSize() {
  return WARPSIZE;
}
// Lane index of the calling thread, obtained by chaining the two AMDGCN mbcnt
// builtins over an all-ones mask.
// NOTE(review): presumably mbcnt_lo/mbcnt_hi count the mask bits belonging to
// lanes below the caller (low and high 32 lanes respectively), which with a
// full mask yields the lane number — confirm against the AMDGPU docs.
EXTERN unsigned GetLaneId() {
  const unsigned AllLanes = ~0u;
  // The low-half count seeds the high-half count.
  const unsigned LowHalfCount = __builtin_amdgcn_mbcnt_lo(AllLanes, 0u);
  return __builtin_amdgcn_mbcnt_hi(AllLanes, LowHalfCount);
}

View File

@ -33,10 +33,12 @@ int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
/// Forward declarations
///
///{
// C linkage: the target implementations define these helpers with unmangled
// names, so the forward declarations here must not be C++-mangled.
extern "C" {
unsigned GetLaneId();
unsigned GetWarpSize();
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
}
///}
/// Fallback implementations of the shuffle sync idiom.

View File

@ -13,64 +13,65 @@
#include "common/debug.h"
#include "target_impl.h"
#include "target_interface.h"
// Splits the 64-bit `val` into two 32-bit pieces written to `lo` and `hi`,
// using a single PTX `mov.b64` with a two-register destination.
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
// Combines the 32-bit values `lo` and `hi` into one 64-bit result, using a
// single PTX `mov.b64` with a two-register source. Inverse of
// __kmpc_impl_unpack.
DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
uint64_t val;
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
}
// Returns the current value of the PTX special register %lanemask_lt.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
__kmpc_impl_lanemask_t res;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
return res;
}
// Returns the current value of the PTX special register %lanemask_gt.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
__kmpc_impl_lanemask_t res;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
return res;
}
// Returns the current value of the PTX special register %smid.
DEVICE uint32_t __kmpc_impl_smid() {
EXTERN uint32_t __kmpc_impl_smid() {
uint32_t id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
return id;
}
// Returns the timer resolution in seconds as a constant (1E-9).
DEVICE double __kmpc_impl_get_wtick() {
EXTERN double __kmpc_impl_get_wtick() {
// Timer precision is 1ns
return ((double)1E-9);
}
// Returns the current time in seconds: reads the PTX %globaltimer register
// (a nanosecond count, per the local's name) and scales it by the tick size
// from __kmpc_impl_get_wtick().
DEVICE double __kmpc_impl_get_wtime() {
EXTERN double __kmpc_impl_get_wtime() {
unsigned long long nsecs;
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
return (double)nsecs * __kmpc_impl_get_wtick();
}
// Returns the 32-bit result of the PTX `activemask.b32` instruction.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
unsigned int Mask;
asm volatile("activemask.b32 %0;" : "=r"(Mask));
return Mask;
}
// Thin wrapper that forwards to __syncthreads().
DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); }
// Thin wrapper that forwards `Mask` unchanged to __nvvm_bar_warp_sync().
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
__nvvm_bar_warp_sync(Mask);
}
// NVPTX specific kernel initialization
// Deliberately empty: the body contains no statements.
DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
}
// Barrier until num_threads arrive.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
int barrier = 1;
@ -80,19 +81,20 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
: "memory");
}
// Memory fences. Each one forwards to a single NVVM membar builtin:
// threadfence -> __nvvm_membar_gl, threadfence_block -> __nvvm_membar_cta,
// threadfence_system -> __nvvm_membar_sys.
DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
// Calls to the NVPTX layer (assuming 1D layout)
// The block/thread queries read only the x component of a PTX special register
// (tid, ctaid, nctaid, ntid); y and z are never consulted.
// The warp queries are derived from the in-block thread id and WARPSIZE:
// GetWarpId divides by WARPSIZE, GetWarpSize returns it, and GetLaneId masks
// with (WARPSIZE - 1), which equals a modulo only when WARPSIZE is a power of
// two.
DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
DEVICE int GetNumberOfBlocksInKernel() {
EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
EXTERN int GetNumberOfBlocksInKernel() {
return __nvvm_read_ptx_sreg_nctaid_x();
}
DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
// Atomics
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
@ -135,15 +137,15 @@ DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
// Values stored in the lock word: UNSET when free, SET while held.
#define UNSET 0u
#define SET 1u
// Initializes `lock` by delegating to __kmpc_impl_unset_lock().
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
}
// Destroys `lock` by delegating to __kmpc_impl_unset_lock(); nothing else is
// released.
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
}
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
// TODO: not sure spinning is a good idea here..
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
int32_t start = __nvvm_read_ptx_sreg_clock();
@ -158,15 +160,15 @@ DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
} // wait for 0 to be the read value
}
// Releases `lock` by atomically exchanging its value with UNSET; the previous
// value is discarded.
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
(void)__kmpc_atomic_exchange(lock, UNSET);
}
// Returns the current lock word, read via an atomic add of zero.
// NOTE(review): this only observes the lock; it does not attempt to acquire
// it — confirm that matches what callers expect from a "test lock".
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
return __kmpc_atomic_add(lock, 0u);
}
// Allocation wrappers: forward directly to malloc() and free().
DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
DEVICE void __kmpc_impl_free(void *x) { free(x); }
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
EXTERN void __kmpc_impl_free(void *x) { free(x); }
#pragma omp end declare target

View File

@ -21,18 +21,19 @@ EXTERN int GetBlockIdInKernel();
// Block/thread counts and warp-relative queries; each target implementation
// supplies the definitions.
EXTERN int GetNumberOfBlocksInKernel();
EXTERN int GetNumberOfThreadsInBlock();
EXTERN unsigned GetWarpId();
EXTERN unsigned GetWarpSize();
EXTERN unsigned GetLaneId();
// Atomics
// These helpers are overloaded on operand width (uint32_t and unsigned long
// long), so they are declared with DEVICE rather than EXTERN. The
// static_assert checks that unsigned long long has the same size as uint64_t.
extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
unsigned long long);
extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
unsigned long long);
// Locks