forked from OSchip/llvm-project
[libomptarget] Fix devicertl build
[libomptarget] Fix devicertl build. The target-specific functions in target_interface are declared extern "C", but the nvptx implementations were mostly defined with C++ name mangling. That happened to work as a quirk of the DEVICE macro expanding to nothing, except for shuffle.h, which only forward-declared the functions with C++ linkage. This change also implements GetWarpSize, as used by shuffle, and includes target_interface in the nvptx target_impl.cu to help catch future divergence between interface and implementation. Reviewed By: jdoerfert. Differential Revision: https://reviews.llvm.org/D98651
This commit is contained in:
parent
9c486eb348
commit
bcb3f0f867
|
@ -130,6 +130,7 @@ EXTERN int GetNumberOfThreadsInBlock() {
|
||||||
}
|
}
|
||||||
|
|
||||||
EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
||||||
|
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
|
||||||
EXTERN unsigned GetLaneId() {
|
EXTERN unsigned GetLaneId() {
|
||||||
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
|
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,10 +33,12 @@ int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
||||||
/// Forward declarations
|
/// Forward declarations
|
||||||
///
|
///
|
||||||
///{
|
///{
|
||||||
|
extern "C" {
|
||||||
unsigned GetLaneId();
|
unsigned GetLaneId();
|
||||||
unsigned GetWarpSize();
|
unsigned GetWarpSize();
|
||||||
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
|
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
|
||||||
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
|
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
|
||||||
|
}
|
||||||
///}
|
///}
|
||||||
|
|
||||||
/// Fallback implementations of the shuffle sync idiom.
|
/// Fallback implementations of the shuffle sync idiom.
|
||||||
|
|
|
@ -13,64 +13,65 @@
|
||||||
|
|
||||||
#include "common/debug.h"
|
#include "common/debug.h"
|
||||||
#include "target_impl.h"
|
#include "target_impl.h"
|
||||||
|
#include "target_interface.h"
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
||||||
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
||||||
uint64_t val;
|
uint64_t val;
|
||||||
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
|
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
|
||||||
__kmpc_impl_lanemask_t res;
|
__kmpc_impl_lanemask_t res;
|
||||||
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
|
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
|
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
|
||||||
__kmpc_impl_lanemask_t res;
|
__kmpc_impl_lanemask_t res;
|
||||||
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
|
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE uint32_t __kmpc_impl_smid() {
|
EXTERN uint32_t __kmpc_impl_smid() {
|
||||||
uint32_t id;
|
uint32_t id;
|
||||||
asm("mov.u32 %0, %%smid;" : "=r"(id));
|
asm("mov.u32 %0, %%smid;" : "=r"(id));
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE double __kmpc_impl_get_wtick() {
|
EXTERN double __kmpc_impl_get_wtick() {
|
||||||
// Timer precision is 1ns
|
// Timer precision is 1ns
|
||||||
return ((double)1E-9);
|
return ((double)1E-9);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE double __kmpc_impl_get_wtime() {
|
EXTERN double __kmpc_impl_get_wtime() {
|
||||||
unsigned long long nsecs;
|
unsigned long long nsecs;
|
||||||
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
|
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
|
||||||
return (double)nsecs * __kmpc_impl_get_wtick();
|
return (double)nsecs * __kmpc_impl_get_wtick();
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
||||||
unsigned int Mask;
|
unsigned int Mask;
|
||||||
asm volatile("activemask.b32 %0;" : "=r"(Mask));
|
asm volatile("activemask.b32 %0;" : "=r"(Mask));
|
||||||
return Mask;
|
return Mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
|
EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); }
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
||||||
__nvvm_bar_warp_sync(Mask);
|
__nvvm_bar_warp_sync(Mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
// NVPTX specific kernel initialization
|
// NVPTX specific kernel initialization
|
||||||
DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
|
EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
|
||||||
}
|
}
|
||||||
|
|
||||||
// Barrier until num_threads arrive.
|
// Barrier until num_threads arrive.
|
||||||
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
|
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||||
// The named barrier for active parallel threads of a team in an L1 parallel
|
// The named barrier for active parallel threads of a team in an L1 parallel
|
||||||
// region to synchronize with each other.
|
// region to synchronize with each other.
|
||||||
int barrier = 1;
|
int barrier = 1;
|
||||||
|
@ -80,19 +81,20 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||||
: "memory");
|
: "memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
|
EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
|
||||||
DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
|
EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
|
||||||
DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
|
EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
|
||||||
|
|
||||||
// Calls to the NVPTX layer (assuming 1D layout)
|
// Calls to the NVPTX layer (assuming 1D layout)
|
||||||
DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
|
EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
|
||||||
DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
|
EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
|
||||||
DEVICE int GetNumberOfBlocksInKernel() {
|
EXTERN int GetNumberOfBlocksInKernel() {
|
||||||
return __nvvm_read_ptx_sreg_nctaid_x();
|
return __nvvm_read_ptx_sreg_nctaid_x();
|
||||||
}
|
}
|
||||||
DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
|
EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
|
||||||
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
|
||||||
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
|
EXTERN unsigned GetWarpSize() { return WARPSIZE; }
|
||||||
|
EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
|
||||||
|
|
||||||
// Atomics
|
// Atomics
|
||||||
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
|
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
|
||||||
|
@ -135,15 +137,15 @@ DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
|
||||||
#define UNSET 0u
|
#define UNSET 0u
|
||||||
#define SET 1u
|
#define SET 1u
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
|
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
|
||||||
__kmpc_impl_unset_lock(lock);
|
__kmpc_impl_unset_lock(lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
|
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
|
||||||
__kmpc_impl_unset_lock(lock);
|
__kmpc_impl_unset_lock(lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
||||||
// TODO: not sure spinning is a good idea here..
|
// TODO: not sure spinning is a good idea here..
|
||||||
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
|
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
|
||||||
int32_t start = __nvvm_read_ptx_sreg_clock();
|
int32_t start = __nvvm_read_ptx_sreg_clock();
|
||||||
|
@ -158,15 +160,15 @@ DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
||||||
} // wait for 0 to be the read value
|
} // wait for 0 to be the read value
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
|
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
|
||||||
(void)__kmpc_atomic_exchange(lock, UNSET);
|
(void)__kmpc_atomic_exchange(lock, UNSET);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
|
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
|
||||||
return __kmpc_atomic_add(lock, 0u);
|
return __kmpc_atomic_add(lock, 0u);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
|
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
|
||||||
DEVICE void __kmpc_impl_free(void *x) { free(x); }
|
EXTERN void __kmpc_impl_free(void *x) { free(x); }
|
||||||
|
|
||||||
#pragma omp end declare target
|
#pragma omp end declare target
|
||||||
|
|
|
@ -21,19 +21,20 @@ EXTERN int GetBlockIdInKernel();
|
||||||
EXTERN int GetNumberOfBlocksInKernel();
|
EXTERN int GetNumberOfBlocksInKernel();
|
||||||
EXTERN int GetNumberOfThreadsInBlock();
|
EXTERN int GetNumberOfThreadsInBlock();
|
||||||
EXTERN unsigned GetWarpId();
|
EXTERN unsigned GetWarpId();
|
||||||
|
EXTERN unsigned GetWarpSize();
|
||||||
EXTERN unsigned GetLaneId();
|
EXTERN unsigned GetLaneId();
|
||||||
|
|
||||||
// Atomics
|
// Atomics
|
||||||
extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
|
DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
|
||||||
extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
|
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
|
||||||
extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
|
DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
|
||||||
extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
|
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
|
||||||
extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
|
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
|
||||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||||
extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
|
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
|
||||||
unsigned long long);
|
unsigned long long);
|
||||||
extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
|
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
|
||||||
unsigned long long);
|
unsigned long long);
|
||||||
|
|
||||||
// Locks
|
// Locks
|
||||||
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
|
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
|
||||||
|
|
Loading…
Reference in New Issue