forked from OSchip/llvm-project
[OpenMP] Fix main thread barrier for Pascal and amdgpu
Fixes what's left of https://bugs.llvm.org/show_bug.cgi?id=51781.

Reviewed By: jdoerfert, JonChesterfield, tianshilei1992

Differential Revision: https://reviews.llvm.org/D113602
This commit is contained in:
parent
30ebdf8a6d
commit
c9dfe322ee
|
@ -3456,7 +3456,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
// Create all the blocks:
|
||||
//
|
||||
// InitCB = __kmpc_target_init(...)
|
||||
// bool IsWorker = InitCB >= 0;
|
||||
// BlockHwSize =
|
||||
// __kmpc_get_hardware_num_threads_in_block();
|
||||
// WarpSize = __kmpc_get_warp_size();
|
||||
// BlockSize = BlockHwSize - WarpSize;
|
||||
// if (InitCB >= BlockSize) return;
|
||||
// IsWorkerCheckBB: bool IsWorker = InitCB >= 0;
|
||||
// if (IsWorker) {
|
||||
// SMBeginBB: __kmpc_barrier_simple_generic(...);
|
||||
// void *WorkFn;
|
||||
|
@ -3484,6 +3489,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
BasicBlock *InitBB = KernelInitCB->getParent();
|
||||
BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
|
||||
KernelInitCB->getNextNode(), "thread.user_code.check");
|
||||
BasicBlock *IsWorkerCheckBB =
|
||||
BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB);
|
||||
BasicBlock *StateMachineBeginBB = BasicBlock::Create(
|
||||
Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
|
||||
BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
|
||||
|
@ -3500,6 +3507,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
|
||||
A.registerManifestAddedBasicBlock(*InitBB);
|
||||
A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
|
||||
A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
|
||||
A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
|
||||
A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
|
||||
A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
|
||||
|
@ -3509,16 +3517,38 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
|
||||
const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
|
||||
ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
|
||||
|
||||
InitBB->getTerminator()->eraseFromParent();
|
||||
|
||||
Module &M = *Kernel->getParent();
|
||||
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
||||
FunctionCallee BlockHwSizeFn =
|
||||
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
|
||||
FunctionCallee WarpSizeFn =
|
||||
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, OMPRTL___kmpc_get_warp_size);
|
||||
Instruction *BlockHwSize =
|
||||
CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB);
|
||||
BlockHwSize->setDebugLoc(DLoc);
|
||||
Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
|
||||
WarpSize->setDebugLoc(DLoc);
|
||||
Instruction *BlockSize =
|
||||
BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB);
|
||||
BlockSize->setDebugLoc(DLoc);
|
||||
Instruction *IsMainOrWorker =
|
||||
ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB,
|
||||
BlockSize, "thread.is_main_or_worker", InitBB);
|
||||
IsMainOrWorker->setDebugLoc(DLoc);
|
||||
BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker,
|
||||
InitBB);
|
||||
|
||||
Instruction *IsWorker =
|
||||
ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
|
||||
ConstantInt::get(KernelInitCB->getType(), -1),
|
||||
"thread.is_worker", InitBB);
|
||||
"thread.is_worker", IsWorkerCheckBB);
|
||||
IsWorker->setDebugLoc(DLoc);
|
||||
BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);
|
||||
|
||||
Module &M = *Kernel->getParent();
|
||||
BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker,
|
||||
IsWorkerCheckBB);
|
||||
|
||||
// Create local storage for the work function pointer.
|
||||
const DataLayout &DL = M.getDataLayout();
|
||||
|
@ -3528,7 +3558,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
"worker.work_fn.addr", &Kernel->getEntryBlock().front());
|
||||
WorkFnAI->setDebugLoc(DLoc);
|
||||
|
||||
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
||||
OMPInfoCache.OMPBuilder.updateToLocation(
|
||||
OpenMPIRBuilder::LocationDescription(
|
||||
IRBuilder<>::InsertPoint(StateMachineBeginBB,
|
||||
|
|
|
@ -912,6 +912,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -921,7 +927,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1037,6 +1043,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -1046,7 +1058,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1185,6 +1197,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -1194,7 +1212,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1311,6 +1329,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -1320,7 +1344,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1435,6 +1459,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -1444,7 +1474,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1559,6 +1589,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -1568,7 +1604,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1659,6 +1695,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -1668,7 +1710,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -1882,6 +1924,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -1890,7 +1938,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2006,6 +2054,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2014,7 +2068,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2153,6 +2207,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2161,7 +2221,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2278,6 +2338,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2286,7 +2352,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2401,6 +2467,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2409,7 +2481,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2524,6 +2596,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2532,7 +2610,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2623,6 +2701,12 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2631,7 +2715,7 @@ attributes #9 = { convergent nounwind readonly willreturn }
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
|
|
@ -184,6 +184,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 {
|
|||
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU-DISABLED: is_worker_check:
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.begin:
|
||||
|
@ -193,7 +199,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 {
|
|||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.finished:
|
||||
; AMDGPU-DISABLED-NEXT: ret void
|
||||
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -231,6 +237,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 {
|
|||
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX-DISABLED: is_worker_check:
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.begin:
|
||||
|
@ -239,7 +251,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 {
|
|||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.finished:
|
||||
; NVPTX-DISABLED-NEXT: ret void
|
||||
; NVPTX-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -636,6 +648,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20()
|
|||
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU-DISABLED: is_worker_check:
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.begin:
|
||||
|
@ -645,7 +663,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20()
|
|||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.finished:
|
||||
; AMDGPU-DISABLED-NEXT: ret void
|
||||
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -685,6 +703,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20()
|
|||
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX-DISABLED: is_worker_check:
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.begin:
|
||||
|
@ -693,7 +717,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20()
|
|||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.finished:
|
||||
; NVPTX-DISABLED-NEXT: ret void
|
||||
; NVPTX-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -1073,6 +1097,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35(
|
|||
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU-DISABLED: is_worker_check:
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.begin:
|
||||
|
@ -1082,7 +1112,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35(
|
|||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.finished:
|
||||
; AMDGPU-DISABLED-NEXT: ret void
|
||||
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -1122,6 +1152,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35(
|
|||
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX-DISABLED: is_worker_check:
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.begin:
|
||||
|
@ -1130,7 +1166,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35(
|
|||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.finished:
|
||||
; NVPTX-DISABLED-NEXT: ret void
|
||||
; NVPTX-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -1550,6 +1586,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guar
|
|||
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU-DISABLED: is_worker_check:
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.begin:
|
||||
|
@ -1559,7 +1601,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guar
|
|||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.finished:
|
||||
; AMDGPU-DISABLED-NEXT: ret void
|
||||
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -1599,6 +1641,12 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guar
|
|||
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX-DISABLED: is_worker_check:
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.begin:
|
||||
|
@ -1607,7 +1655,7 @@ define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guar
|
|||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.finished:
|
||||
; NVPTX-DISABLED-NEXT: ret void
|
||||
; NVPTX-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -2027,6 +2075,12 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU: is_worker_check:
|
||||
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.begin:
|
||||
|
@ -2036,7 +2090,7 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU: worker_state_machine.finished:
|
||||
; AMDGPU-NEXT: ret void
|
||||
; AMDGPU: worker_state_machine.is_active.check:
|
||||
|
@ -2068,6 +2122,12 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX: is_worker_check:
|
||||
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.begin:
|
||||
|
@ -2076,7 +2136,7 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX: worker_state_machine.finished:
|
||||
; NVPTX-NEXT: ret void
|
||||
; NVPTX: worker_state_machine.is_active.check:
|
||||
|
@ -2109,6 +2169,12 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; AMDGPU-DISABLED: is_worker_check:
|
||||
; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.begin:
|
||||
|
@ -2118,7 +2184,7 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; AMDGPU-DISABLED: worker_state_machine.finished:
|
||||
; AMDGPU-DISABLED-NEXT: ret void
|
||||
; AMDGPU-DISABLED: worker_state_machine.is_active.check:
|
||||
|
@ -2151,6 +2217,12 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
|
||||
; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
|
||||
; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; NVPTX-DISABLED: is_worker_check:
|
||||
; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.begin:
|
||||
|
@ -2159,7 +2231,7 @@ define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 {
|
|||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; NVPTX-DISABLED: worker_state_machine.finished:
|
||||
; NVPTX-DISABLED-NEXT: ret void
|
||||
; NVPTX-DISABLED: worker_state_machine.is_active.check:
|
||||
|
|
|
@ -177,6 +177,12 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(i32* %x, i64 %N)
|
|||
; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
|
||||
; CHECK-DISABLED-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32
|
||||
; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i8 1, i1 false, i1 true) #[[ATTR4:[0-9]+]]
|
||||
; CHECK-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
|
||||
; CHECK-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
|
||||
; CHECK-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
|
||||
; CHECK-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
|
||||
; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
|
||||
; CHECK-DISABLED: is_worker_check:
|
||||
; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
|
||||
; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
|
||||
; CHECK-DISABLED: worker_state_machine.begin:
|
||||
|
@ -185,7 +191,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(i32* %x, i64 %N)
|
|||
; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
|
||||
; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
|
||||
; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
|
||||
; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
|
||||
; CHECK-DISABLED: worker_state_machine.finished:
|
||||
; CHECK-DISABLED-NEXT: ret void
|
||||
; CHECK-DISABLED: worker_state_machine.is_active.check:
|
||||
|
|
|
@ -86,7 +86,21 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
|
|||
if (mapping::isInitialThreadInLevel0(IsSPMD))
|
||||
return -1;
|
||||
|
||||
if (UseGenericStateMachine)
|
||||
// Enter the generic state machine if enabled and if this thread can possibly
|
||||
// be an active worker thread.
|
||||
//
|
||||
// The latter check is important for NVIDIA Pascal (but not Volta) and AMD
|
||||
// GPU. In those cases, a single thread can apparently satisfy a barrier on
|
||||
// behalf of all threads in the same warp. Thus, it would not be safe for
|
||||
// other threads in the main thread's warp to reach the first
|
||||
// synchronize::threads call in genericStateMachine before the main thread
|
||||
// reaches its corresponding synchronize::threads call: that would permit all
|
||||
// active worker threads to proceed before the main thread has actually set
|
||||
// state::ParallelRegionFn, and then they would immediately quit without
|
||||
// doing any work. mapping::getBlockSize() does not include any of the main
|
||||
// thread's warp, so none of its threads can ever be active worker threads.
|
||||
if (UseGenericStateMachine &&
|
||||
mapping::getThreadIdInBlock() < mapping::getBlockSize())
|
||||
genericStateMachine(Ident);
|
||||
|
||||
return mapping::getThreadIdInBlock();
|
||||
|
|
|
@ -225,7 +225,22 @@ int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
|
|||
if (TId == GetMasterThreadID())
|
||||
return -1;
|
||||
|
||||
if (UseGenericStateMachine)
|
||||
// Enter the generic state machine if enabled and if this thread can possibly
|
||||
// be an active worker thread.
|
||||
//
|
||||
// The latter check is important for NVIDIA Pascal (but not Volta) and AMD
|
||||
// GPU. In those cases, a single thread can apparently satisfy a barrier on
|
||||
// behalf of all threads in the same warp. Thus, it would not be safe for
|
||||
// other threads in the main thread's warp to reach the first
|
||||
// __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine
|
||||
// before the main thread reaches its corresponding
|
||||
// __kmpc_barrier_simple_spmd call: that would permit all active worker
|
||||
// threads to proceed before the main thread has actually set
|
||||
// omptarget_nvptx_workFn, and then they would immediately quit without
|
||||
// doing any work. GetNumberOfWorkersInTeam() does not include any of the
|
||||
// main thread's warp, so none of its threads can ever be active worker
|
||||
// threads.
|
||||
if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam())
|
||||
__kmpc_target_region_state_machine(Ident);
|
||||
|
||||
return TId;
|
||||
|
|
|
@ -231,6 +231,7 @@ namespace _OMP {
|
|||
__attribute__((used, weak, optnone)) void keepAlive() {
|
||||
__kmpc_get_hardware_thread_id_in_block();
|
||||
__kmpc_get_hardware_num_threads_in_block();
|
||||
__kmpc_get_warp_size();
|
||||
__kmpc_barrier_simple_spmd(nullptr, 0);
|
||||
__kmpc_barrier_simple_generic(nullptr, 0);
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ EXTERN int __kmpc_get_hardware_thread_id_in_block();
|
|||
EXTERN int GetBlockIdInKernel();
|
||||
EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
|
||||
EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
|
||||
EXTERN unsigned __kmpc_get_warp_size();
|
||||
EXTERN unsigned GetWarpId();
|
||||
EXTERN unsigned GetWarpSize();
|
||||
EXTERN unsigned GetLaneId();
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
// Use the generic state machine. On some architectures, other threads in the
|
||||
// main thread's warp must avoid barrier instructions.
|
||||
//
|
||||
// RUN: %libomptarget-compile-run-and-check-generic
|
||||
|
||||
// SPMDize. There is no main thread, so there's no issue.
|
||||
//
|
||||
// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt > %t.spmd 2>&1
|
||||
// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=SPMD -input-file=%t.spmd
|
||||
// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=SPMD -input-file=%t.spmd
|
||||
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
|
||||
//
|
||||
// SPMD: Transformed generic-mode kernel to SPMD-mode.
|
||||
|
||||
// Use the custom state machine, which must avoid the same barrier problem as
|
||||
// the generic state machine.
|
||||
//
|
||||
// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt \
|
||||
// RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
|
||||
// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
|
||||
// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
|
||||
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
|
||||
//
|
||||
// CUSTOM: Rewriting generic-mode kernel with a customized state machine.
|
||||
|
||||
#include <stdio.h>
|
||||
int main() {
|
||||
int x = 0, y = 1;
|
||||
#pragma omp target teams num_teams(1) map(tofrom:x, y)
|
||||
{
|
||||
x = 5;
|
||||
#pragma omp parallel
|
||||
y = 6;
|
||||
}
|
||||
// CHECK: 5, 6
|
||||
printf("%d, %d\n", x, y);
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue