forked from OSchip/llvm-project
[OpenMP][FIX] Do not signal SPMD-mode but then keep generic-mode
If we assume SPMD-mode during the fixpoint iteration, we have to execute the kernel in SPMD-mode. If we change our mind during manifest, there is the chance of a mismatch between the simplification, e.g., of `__kmpc_is_spmd_exec_mode` calls, and the execution mode. This problem was introduced in D109438. This patch is a compromise to resolve the problem purely in OpenMP-opt while trying to keep the benefits of D109438 around. This might not always work, see `get_hardware_num_threads_in_block_fold`, but it often does. At the same time, we do keep value specialization and execution mode in sync. Proper solutions to this problem should be considered. I believe a new execution mode is the easiest way forward (Singleton-SPMD). Alternatively, SPMD-mode execution can be used with a way to provide a new thread_limit (here 1) to the runtime. This is more general and could be useful if we see `num_threads` clauses or workshared loops with small trip counts in the kernel. In either proposal, we need to disable the guarding for the kernel (which was the motivation for D109438). Reviewed By: jhuber6 Differential Revision: https://reviews.llvm.org/D112894
This commit is contained in:
parent
73720c8059
commit
d61aac76bf
|
@ -597,6 +597,10 @@ struct KernelInfoState : AbstractState {
|
|||
/// See AbstractState::indicateOptimisticFixpoint(...)
|
||||
ChangeStatus indicateOptimisticFixpoint() override {
|
||||
IsAtFixpoint = true;
|
||||
ReachingKernelEntries.indicateOptimisticFixpoint();
|
||||
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
|
||||
ReachedKnownParallelRegions.indicateOptimisticFixpoint();
|
||||
ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
|
||||
return ChangeStatus::UNCHANGED;
|
||||
}
|
||||
|
||||
|
@ -3058,19 +3062,16 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
if (!KernelInitCB || !KernelDeinitCB)
|
||||
return ChangeStatus::UNCHANGED;
|
||||
|
||||
// Known SPMD-mode kernels need no manifest changes.
|
||||
if (SPMDCompatibilityTracker.isKnown())
|
||||
return ChangeStatus::UNCHANGED;
|
||||
|
||||
// If we can we change the execution mode to SPMD-mode otherwise we build a
|
||||
// custom state machine.
|
||||
if (!mayContainParallelRegion() || !changeToSPMDMode(A))
|
||||
ChangeStatus Changed = ChangeStatus::UNCHANGED;
|
||||
if (!changeToSPMDMode(A, Changed))
|
||||
return buildCustomStateMachine(A);
|
||||
|
||||
return ChangeStatus::CHANGED;
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool changeToSPMDMode(Attributor &A) {
|
||||
bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
|
||||
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
||||
|
||||
if (!SPMDCompatibilityTracker.isAssumed()) {
|
||||
|
@ -3102,6 +3103,24 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Check if the kernel is already in SPMD mode, if so, return success.
|
||||
Function *Kernel = getAnchorScope();
|
||||
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
|
||||
(Kernel->getName() + "_exec_mode").str());
|
||||
assert(ExecMode && "Kernel without exec mode?");
|
||||
assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
|
||||
|
||||
// Set the global exec mode flag to indicate SPMD-Generic mode.
|
||||
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
|
||||
"ExecMode is not an integer!");
|
||||
const int8_t ExecModeVal =
|
||||
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
|
||||
if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
|
||||
return true;
|
||||
|
||||
// We will now unconditionally modify the IR, indicate a change.
|
||||
Changed = ChangeStatus::CHANGED;
|
||||
|
||||
auto CreateGuardedRegion = [&](Instruction *RegionStartI,
|
||||
Instruction *RegionEndI) {
|
||||
LoopInfo *LI = nullptr;
|
||||
|
@ -3312,17 +3331,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
|
||||
// Adjust the global exec mode flag that tells the runtime what mode this
|
||||
// kernel is executed in.
|
||||
Function *Kernel = getAnchorScope();
|
||||
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
|
||||
(Kernel->getName() + "_exec_mode").str());
|
||||
assert(ExecMode && "Kernel without exec mode?");
|
||||
assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
|
||||
|
||||
// Set the global exec mode flag to indicate SPMD-Generic mode.
|
||||
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
|
||||
"ExecMode is not an integer!");
|
||||
const int8_t ExecModeVal =
|
||||
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
|
||||
assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
|
||||
"Initially non-SPMD kernel has SPMD exec mode!");
|
||||
ExecMode->setInitializer(
|
||||
|
@ -3699,6 +3707,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
}
|
||||
|
||||
// Callback to check a call instruction.
|
||||
bool AllParallelRegionStatesWereFixed = true;
|
||||
bool AllSPMDStatesWereFixed = true;
|
||||
auto CheckCallInst = [&](Instruction &I) {
|
||||
auto &CB = cast<CallBase>(I);
|
||||
|
@ -3706,6 +3715,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
|
||||
getState() ^= CBAA.getState();
|
||||
AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
|
||||
AllParallelRegionStatesWereFixed &=
|
||||
CBAA.ReachedKnownParallelRegions.isAtFixpoint();
|
||||
AllParallelRegionStatesWereFixed &=
|
||||
CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@ -3717,6 +3730,23 @@ struct AAKernelInfoFunction : AAKernelInfo {
|
|||
return indicatePessimisticFixpoint();
|
||||
}
|
||||
|
||||
// If we haven't used any assumed information for the reached parallel
|
||||
// region states we can fix it.
|
||||
if (!UsedAssumedInformationInCheckCallInst &&
|
||||
AllParallelRegionStatesWereFixed) {
|
||||
ReachedKnownParallelRegions.indicateOptimisticFixpoint();
|
||||
ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
|
||||
}
|
||||
|
||||
// If we are sure there are no parallel regions in the kernel we do not
|
||||
// want SPMD mode.
|
||||
if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
|
||||
ReachedKnownParallelRegions.isAtFixpoint() &&
|
||||
ReachedUnknownParallelRegions.isValidState() &&
|
||||
ReachedKnownParallelRegions.isValidState() &&
|
||||
!mayContainParallelRegion())
|
||||
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
|
||||
|
||||
// If we haven't used any assumed information for the SPMD state we can fix
|
||||
// it.
|
||||
if (!UsedAssumedInformationInCheckRWInst &&
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
|
||||
@__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1
|
||||
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata"
|
||||
@G = external global i8
|
||||
|
||||
; Function Attrs: convergent norecurse nounwind
|
||||
define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
|
||||
|
@ -16,6 +17,7 @@ define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
|
|||
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
|
||||
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
|
||||
; CHECK: user_code.entry:
|
||||
; CHECK-NEXT: store i8 0, i8* @G, align 1
|
||||
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
|
||||
; CHECK-NEXT: ret void
|
||||
; CHECK: worker.exit:
|
||||
|
@ -27,6 +29,12 @@ entry:
|
|||
br i1 %exec_user_code, label %user_code.entry, label %worker.exit
|
||||
|
||||
user_code.entry: ; preds = %entry
|
||||
; Ensure we see a 0 here as the kernel doesn't have parallel regions and we want
|
||||
; generic execution.
|
||||
; TODO: This is not perfect. We should rather go for SPMD mode and tell the runtime
|
||||
; to only spawn a single thread. Further, we then should not guard any code.
|
||||
%isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
|
||||
store i8 %isSPMD, i8* @G
|
||||
call void @bar() #2
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
|
||||
ret void
|
||||
|
@ -35,6 +43,8 @@ worker.exit: ; preds = %entry
|
|||
ret void
|
||||
}
|
||||
|
||||
declare i8 @__kmpc_is_spmd_exec_mode()
|
||||
|
||||
declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
|
||||
|
||||
declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)
|
||||
|
|
|
@ -8,9 +8,9 @@ target triple = "nvptx64"
|
|||
|
||||
@G = external global i32
|
||||
;.
|
||||
; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
|
||||
; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
|
||||
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
|
||||
; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
|
||||
; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
|
||||
; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
|
||||
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
|
||||
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
|
||||
|
@ -25,11 +25,11 @@ define weak void @kernel0() #0 {
|
|||
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
|
||||
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
|
||||
call void @helper0()
|
||||
call void @helper1()
|
||||
call void @helper2()
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -43,9 +43,9 @@ define weak void @kernel1() #0 {
|
|||
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
|
||||
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
|
||||
call void @helper1()
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue