[OpenMP][FIX] Do not signal SPMD-mode but then keep generic-mode

If we assume SPMD-mode during the fixpoint iteration, we have to execute
the kernel in SPMD-mode. If we change our mind during manifest, there is
a chance of a mismatch between the simplification, e.g., of
`__kmpc_is_spmd_exec_mode` calls, and the execution mode. This problem
was introduced in D109438.

This patch is a compromise to resolve the problem purely in OpenMP-opt
while trying to keep the benefits of D109438 around. This might not
always work, see `get_hardware_num_threads_in_block_fold`, but it often
does. At the same time, we do keep value specialization and execution
mode in sync.

Proper solutions to this problem should be considered. I believe a new
execution mode is the easiest way forward (Singleton-SPMD).
Alternatively, SPMD-mode execution can be used with a way to provide a
new thread_limit (here 1) to the runtime. This is more general and could
be useful if we see `num_threads` clauses or workshared loops with small
trip counts in the kernel. In either proposal we need to disable the
guarding for the kernel (which was the motivation for D109438).

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D112894
This commit is contained in:
Johannes Doerfert 2021-10-31 14:22:50 -05:00
parent 73720c8059
commit d61aac76bf
3 changed files with 64 additions and 24 deletions

View File

@ -597,6 +597,10 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicateOptimisticFixpoint(...) /// See AbstractState::indicateOptimisticFixpoint(...)
ChangeStatus indicateOptimisticFixpoint() override { ChangeStatus indicateOptimisticFixpoint() override {
IsAtFixpoint = true; IsAtFixpoint = true;
ReachingKernelEntries.indicateOptimisticFixpoint();
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
ReachedKnownParallelRegions.indicateOptimisticFixpoint();
ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
return ChangeStatus::UNCHANGED; return ChangeStatus::UNCHANGED;
} }
@ -3058,19 +3062,16 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!KernelInitCB || !KernelDeinitCB) if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED; return ChangeStatus::UNCHANGED;
// Known SPMD-mode kernels need no manifest changes.
if (SPMDCompatibilityTracker.isKnown())
return ChangeStatus::UNCHANGED;
// If we can we change the execution mode to SPMD-mode otherwise we build a // If we can we change the execution mode to SPMD-mode otherwise we build a
// custom state machine. // custom state machine.
if (!mayContainParallelRegion() || !changeToSPMDMode(A)) ChangeStatus Changed = ChangeStatus::UNCHANGED;
if (!changeToSPMDMode(A, Changed))
return buildCustomStateMachine(A); return buildCustomStateMachine(A);
return ChangeStatus::CHANGED; return Changed;
} }
bool changeToSPMDMode(Attributor &A) { bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
if (!SPMDCompatibilityTracker.isAssumed()) { if (!SPMDCompatibilityTracker.isAssumed()) {
@ -3102,6 +3103,24 @@ struct AAKernelInfoFunction : AAKernelInfo {
return false; return false;
} }
// Check if the kernel is already in SPMD mode, if so, return success.
Function *Kernel = getAnchorScope();
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
(Kernel->getName() + "_exec_mode").str());
assert(ExecMode && "Kernel without exec mode?");
assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
// Set the global exec mode flag to indicate SPMD-Generic mode.
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
return true;
// We will now unconditionally modify the IR, indicate a change.
Changed = ChangeStatus::CHANGED;
auto CreateGuardedRegion = [&](Instruction *RegionStartI, auto CreateGuardedRegion = [&](Instruction *RegionStartI,
Instruction *RegionEndI) { Instruction *RegionEndI) {
LoopInfo *LI = nullptr; LoopInfo *LI = nullptr;
@ -3312,17 +3331,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
// Adjust the global exec mode flag that tells the runtime what mode this // Adjust the global exec mode flag that tells the runtime what mode this
// kernel is executed in. // kernel is executed in.
Function *Kernel = getAnchorScope();
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
(Kernel->getName() + "_exec_mode").str());
assert(ExecMode && "Kernel without exec mode?");
assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
// Set the global exec mode flag to indicate SPMD-Generic mode.
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
"Initially non-SPMD kernel has SPMD exec mode!"); "Initially non-SPMD kernel has SPMD exec mode!");
ExecMode->setInitializer( ExecMode->setInitializer(
@ -3699,6 +3707,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
} }
// Callback to check a call instruction. // Callback to check a call instruction.
bool AllParallelRegionStatesWereFixed = true;
bool AllSPMDStatesWereFixed = true; bool AllSPMDStatesWereFixed = true;
auto CheckCallInst = [&](Instruction &I) { auto CheckCallInst = [&](Instruction &I) {
auto &CB = cast<CallBase>(I); auto &CB = cast<CallBase>(I);
@ -3706,6 +3715,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
getState() ^= CBAA.getState(); getState() ^= CBAA.getState();
AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint(); AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
AllParallelRegionStatesWereFixed &=
CBAA.ReachedKnownParallelRegions.isAtFixpoint();
AllParallelRegionStatesWereFixed &=
CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
return true; return true;
}; };
@ -3717,6 +3730,23 @@ struct AAKernelInfoFunction : AAKernelInfo {
return indicatePessimisticFixpoint(); return indicatePessimisticFixpoint();
} }
// If we haven't used any assumed information for the reached parallel
// region states we can fix it.
if (!UsedAssumedInformationInCheckCallInst &&
AllParallelRegionStatesWereFixed) {
ReachedKnownParallelRegions.indicateOptimisticFixpoint();
ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
}
// If we are sure there are no parallel regions in the kernel we do not
// want SPMD mode.
if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
ReachedKnownParallelRegions.isAtFixpoint() &&
ReachedUnknownParallelRegions.isValidState() &&
ReachedKnownParallelRegions.isValidState() &&
!mayContainParallelRegion())
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
// If we haven't used any assumed information for the SPMD state we can fix // If we haven't used any assumed information for the SPMD state we can fix
// it. // it.
if (!UsedAssumedInformationInCheckRWInst && if (!UsedAssumedInformationInCheckRWInst &&

View File

@ -6,6 +6,7 @@
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1 @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata" @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata"
@G = external global i8
; Function Attrs: convergent norecurse nounwind ; Function Attrs: convergent norecurse nounwind
define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 { define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
@ -16,6 +17,7 @@ define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry: ; CHECK: user_code.entry:
; CHECK-NEXT: store i8 0, i8* @G, align 1
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; CHECK: worker.exit: ; CHECK: worker.exit:
@ -27,6 +29,12 @@ entry:
br i1 %exec_user_code, label %user_code.entry, label %worker.exit br i1 %exec_user_code, label %user_code.entry, label %worker.exit
user_code.entry: ; preds = %entry user_code.entry: ; preds = %entry
; Ensure we see a 0 here as the kernel doesn't have parallel regions and we want
; generic execution.
; TODO: This is not perfect. We should rather go for SPMD mode and tell the runtime
; to only spawn a single thread. Further, we then should not guard any code.
%isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
store i8 %isSPMD, i8* @G
call void @bar() #2 call void @bar() #2
call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
ret void ret void
@ -35,6 +43,8 @@ worker.exit: ; preds = %entry
ret void ret void
} }
declare i8 @__kmpc_is_spmd_exec_mode()
declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)

View File

@ -8,9 +8,9 @@ target triple = "nvptx64"
@G = external global i32 @G = external global i32
;. ;.
; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 ; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
@ -25,11 +25,11 @@ define weak void @kernel0() #0 {
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
call void @helper0() call void @helper0()
call void @helper1() call void @helper1()
call void @helper2() call void @helper2()
call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
ret void ret void
} }
@ -43,9 +43,9 @@ define weak void @kernel1() #0 {
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
%i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
call void @helper1() call void @helper1()
call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
ret void ret void
} }