[libomptarget][devicertl] Replace lanemask with uint64 at interface

Use uint64_t for the lanemask on all GPU architectures at the interface
with clang, and update the tests accordingly. The deviceRTL is always linked
as IR, so the zext and trunc introduced for wave32 architectures fold away
after inlining.
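
On a wave32 target the native mask is only 32 bits wide, so the fixed 64-bit
interface implies a widening on return and a narrowing before the mask is
consumed. A minimal C++ sketch of that shape, using a hypothetical helper in
place of the target's activemask builtin rather than the actual deviceRTL
sources:

#include <cstdint>

// Hypothetical stand-in for a wave32 target's 32-bit activemask builtin
// (e.g. __activemask() on CUDA); not the real deviceRTL helper.
static inline uint32_t native_activemask_w32() { return 0xffffffffu; }

extern "C" uint64_t __kmpc_warp_active_thread_mask(void) {
  // Implicit zero-extension from uint32_t to uint64_t; in IR this is a
  // zext that folds against the consumer's trunc once inlined.
  return native_activemask_w32();
}

extern "C" void __kmpc_syncwarp(uint64_t Mask) {
  // Truncate back to the wave width before handing the mask to the
  // target's warp-synchronization primitive (omitted in this sketch).
  uint32_t Native = static_cast<uint32_t>(Mask);
  (void)Native;
}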

The simplification is partly motivated by amdgpu gfx10, which will be wave32
and is awkward to express in the current arch-dependent typedef interface.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D108317
commit 21d91a8ef3 (parent cfb6dfcbd1)
Author: Jon Chesterfield
Date: 2021-08-18 20:47:33 +01:00

9 changed files with 26 additions and 40 deletions


@@ -485,7 +485,7 @@ int bar(int n){
 // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
 // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK3-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
 // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK3-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -508,7 +508,7 @@ int bar(int n){
 // CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK3-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK3: omp.critical.sync:
-// CHECK3-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK3-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
 // CHECK3-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
 // CHECK3-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -938,7 +938,7 @@ int bar(int n){
 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
 // CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK4-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
 // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -961,7 +961,7 @@ int bar(int n){
 // CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK4: omp.critical.sync:
-// CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK4-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
 // CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
 // CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -1391,7 +1391,7 @@ int bar(int n){
 // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK5-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
 // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -1414,7 +1414,7 @@ int bar(int n){
 // CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK5: omp.critical.sync:
-// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK5-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
 // CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
 // CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -1663,7 +1663,7 @@ int bar(int n){
 // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK1-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
 // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -1686,7 +1686,7 @@ int bar(int n){
 // CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK1: omp.critical.sync:
-// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK1-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
 // CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
 // CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -1935,7 +1935,7 @@ int bar(int n){
 // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
 // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK2-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
 // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -1958,7 +1958,7 @@ int bar(int n){
 // CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK2: omp.critical.sync:
-// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK2-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
 // CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
 // CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]]
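
The CHECK lines above all follow the same shape: the mask of active lanes is
captured before the serialized critical loop and reused to reconverge the
warp afterwards. A rough C++ rendering of that emitted pattern, with the
Body callback standing in for the user's critical-section statements:

#include <cstdint>

extern "C" uint64_t __kmpc_warp_active_thread_mask(void);
extern "C" void __kmpc_syncwarp(uint64_t Mask);

// Sketch of what the device codegen wraps around a critical region.
static void critical_skeleton(void (*Body)(void)) {
  uint64_t Mask = __kmpc_warp_active_thread_mask(); // snapshot active lanes
  Body(); // lanes take turns inside __kmpc_critical/__kmpc_end_critical
  __kmpc_syncwarp(Mask); // reconverge the lanes that were active at entry
}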


@@ -654,9 +654,6 @@ public:
                                 omp::IdentFlag Flags = omp::IdentFlag(0),
                                 unsigned Reserve2Flags = 0);
 
-  // Get the type corresponding to __kmpc_impl_lanemask_t from the deviceRTL
-  Type *getLanemaskType();
-
   /// Generate control flow and cleanup for cancellation.
   ///
   /// \param CancelFlag Flag indicating if the cancellation is performed.


@@ -39,7 +39,6 @@ __OMP_TYPE(Int32Ptr)
 __OMP_TYPE(Int64Ptr)
 
 OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx))
-OMP_TYPE(LanemaskTy, getLanemaskType())
 
 #define __OMP_PTR_TYPE(NAME, BASE) OMP_TYPE(NAME, BASE->getPointerTo())
@@ -443,8 +442,8 @@ __OMP_RTL(__kmpc_parallel_level, false, Int8, )
 __OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
 __OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
-__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
-__OMP_RTL(__kmpc_syncwarp, false, Void, LanemaskTy)
+__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
+__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 __OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
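
With LanemaskTy gone, a frontend reaches these entry points through the
ordinary table lookup and always sees the Int64 signature.
getOrCreateRuntimeFunction and the OMPRTL_ enum are the real OpenMPIRBuilder
API; the wrapper function around them is illustrative:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emit a call to __kmpc_warp_active_thread_mask; the callee is declared
// as `i64 ()` on every target, so no triple-dependent type query remains.
static Value *emitActiveMask(OpenMPIRBuilder &OMPBuilder, Module &M,
                             IRBuilder<> &Builder) {
  FunctionCallee Fn = OMPBuilder.getOrCreateRuntimeFunction(
      M, omp::OMPRTL___kmpc_warp_active_thread_mask);
  return Builder.CreateCall(Fn);
}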


@@ -261,14 +261,6 @@ Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
   return Builder.CreatePointerCast(Ident, IdentPtr);
 }
 
-Type *OpenMPIRBuilder::getLanemaskType() {
-  LLVMContext &Ctx = M.getContext();
-  Triple triple(M.getTargetTriple());
-  // This test is adequate until deviceRTL has finer grained lane widths
-  return triple.isAMDGCN() ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx);
-}
-
 Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
   Constant *&SrcLocStr = SrcLocStrMap[LocStr];
   if (!SrcLocStr) {


@@ -626,9 +626,9 @@ declare void @__kmpc_destroy_allocator(i32, i8*)
 
 declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
 
-declare i32 @__kmpc_warp_active_thread_mask()
+declare i64 @__kmpc_warp_active_thread_mask()
 
-declare void @__kmpc_syncwarp(i32)
+declare void @__kmpc_syncwarp(i64)
 
 declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**)
@@ -1149,10 +1149,10 @@ attributes #0 = { noinline cold }
 ; CHECK-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
 
 ; CHECK: ; Function Attrs: convergent nounwind
-; CHECK-NEXT: declare i32 @__kmpc_warp_active_thread_mask()
+; CHECK-NEXT: declare i64 @__kmpc_warp_active_thread_mask()
 
 ; CHECK: ; Function Attrs: convergent nounwind
-; CHECK-NEXT: declare void @__kmpc_syncwarp(i32)
+; CHECK-NEXT: declare void @__kmpc_syncwarp(i64)
 
 ; CHECK: ; Function Attrs: nounwind
 ; CHECK-NEXT: declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**)
@@ -1677,10 +1677,10 @@ attributes #0 = { noinline cold }
 ; OPTIMISTIC-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
 
 ; OPTIMISTIC: ; Function Attrs: convergent nounwind
-; OPTIMISTIC-NEXT: declare i32 @__kmpc_warp_active_thread_mask()
+; OPTIMISTIC-NEXT: declare i64 @__kmpc_warp_active_thread_mask()
 
 ; OPTIMISTIC: ; Function Attrs: convergent nounwind
-; OPTIMISTIC-NEXT: declare void @__kmpc_syncwarp(i32)
+; OPTIMISTIC-NEXT: declare void @__kmpc_syncwarp(i64)
 
 ; OPTIMISTIC: ; Function Attrs: nounwind
 ; OPTIMISTIC-NEXT: declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**)


@@ -247,9 +247,9 @@ void __kmpc_end_single(IdentTy *Loc, int32_t TId);
 
 void __kmpc_flush(IdentTy *Loc);
 
-__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask();
+uint64_t __kmpc_warp_active_thread_mask(void);
 
-void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask);
+void __kmpc_syncwarp(uint64_t Mask);
 
 void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);


@@ -286,11 +286,9 @@ void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
 
 void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
 
-__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
-  return mapping::activemask();
-}
+uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
 
-void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); }
+void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
 
 void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
   omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));


@@ -123,7 +123,7 @@ EXTERN void __kmpc_flush(kmp_Ident *loc) {
 ////////////////////////////////////////////////////////////////////////////////
 // Vote
 ////////////////////////////////////////////////////////////////////////////////
-EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
+EXTERN uint64_t __kmpc_warp_active_thread_mask(void) {
   PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
   return __kmpc_impl_activemask();
 }
@@ -132,7 +132,7 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
 ////////////////////////////////////////////////////////////////////////////////
 // Syncwarp
 ////////////////////////////////////////////////////////////////////////////////
-EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) {
+EXTERN void __kmpc_syncwarp(uint64_t Mask) {
   PRINT0(LD_IO, "call __kmpc_syncwarp\n");
   __kmpc_impl_syncwarp(Mask);
 }
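
Behind these entry points the per-target layer supplies
__kmpc_impl_activemask and __kmpc_impl_syncwarp. A sketch of plausible
definitions, assuming the guard macros and builtins shown here rather than
quoting the deviceRTL target headers:

#include <cstdint>

#if defined(__AMDGCN__)
// wave64: the exec mask is natively 64 bits, so it passes through as-is.
static inline uint64_t __kmpc_impl_activemask() {
  return __builtin_amdgcn_read_exec();
}
// Wavefront lanes execute in lockstep, so reconvergence is a no-op here.
static inline void __kmpc_impl_syncwarp(uint64_t) {}
#elif defined(__CUDA_ARCH__)
// wave32: the 32-bit activemask widens to uint64_t at the interface and
// narrows again before reaching the hardware sync.
static inline uint64_t __kmpc_impl_activemask() { return __activemask(); }
static inline void __kmpc_impl_syncwarp(uint64_t Mask) {
  __syncwarp(static_cast<uint32_t>(Mask));
}
#endif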


@@ -375,9 +375,9 @@ EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
 EXTERN void __kmpc_flush(kmp_Ident *loc);
 
 // vote
-EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask();
+EXTERN uint64_t __kmpc_warp_active_thread_mask(void);
 
 // syncwarp
-EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t);
+EXTERN void __kmpc_syncwarp(uint64_t);
 
 // tasks
 EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid,