[NewPM] Disable non-trivial loop-unswitch on targets with divergence

Unswitching a loop on a non-trivial divergent branch is expensive
since it serializes the execution of both versions of the
loop. But identifying a divergent branch needs divergence analysis,
which is a function level analysis.

The legacy pass manager handles this dependency by isolating such a
loop transform and rerunning the required function analyses. This
functionality is currently missing in the new pass manager, and there
is no safe way for the SimpleLoopUnswitch pass to depend on
DivergenceAnalysis. So we conservatively assume that all non-trivial
branches are divergent if the target has divergence.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D98958
This commit is contained in:
Sameer Sahasrabuddhe 2021-03-25 11:27:10 +00:00
parent 1e56e8717f
commit b92c8c22b9
3 changed files with 68 additions and 48 deletions

View File

@ -2901,10 +2901,20 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
return true;
}
// If we're not doing non-trivial unswitching, we're done. We both accept
// a parameter but also check a local flag that can be used for testing
// a debugging.
if (!NonTrivial && !EnableNonTrivialUnswitch)
// Check whether we should continue with non-trivial conditions.
// EnableNonTrivialUnswitch: Global variable that forces non-trivial
// unswitching for testing and debugging.
// NonTrivial: Parameter that enables non-trivial unswitching for this
// invocation of the transform. But this should be allowed only
// for targets without branch divergence.
//
// FIXME: If divergence analysis becomes available to a loop
// transform, we should allow unswitching for non-trivial uniform
// branches even on targets that have divergence.
// https://bugs.llvm.org/show_bug.cgi?id=48819
bool ContinueWithNonTrivial =
EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence());
if (!ContinueWithNonTrivial)
return false;
// Skip non-trivial unswitching for optsize functions.

View File

@ -1,47 +1,4 @@
; RUN: opt -mtriple=amdgcn-- -O3 -S -enable-new-pm=0 %s | FileCheck %s
; This fails with the new pass manager:
; https://bugs.llvm.org/show_bug.cgi?id=48819
; Check that loop unswitch happened and condition hoisted out of the loop.
; Condition is uniform so all targets should perform unswitching.
; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
; CHECK: entry:
; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
; CHECK-NEXT: br i1
; Test kernel: a counted loop that stores the induction variable into %out[i]
; on every iteration for which %x == 123456. The comparison on %x is computed
; once outside the loop, so the branch inside the loop body is loop-invariant;
; this is the branch the test expects the optimizer to unswitch.
define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
entry:
; Guard: the loop is entered only when %n is strictly positive.
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
; Loop-invariant condition, evaluated once before the loop is entered.
%cmp1 = icmp eq i32 %x, 123456
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.inc
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.inc, %for.body.lr.ph
; Induction variable: starts at 0 and takes %inc on the back edge.
%i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
; Branch on the invariant condition; the store is skipped when it is false.
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
; out[i] = i
%arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
store i32 %i.07, i32 * %arrayidx, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
; Advance the counter and leave the loop once it reaches %n.
%inc = add nuw nsw i32 %i.07, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
; Check that loop unswitch does not happen if condition is divergent.

View File

@ -0,0 +1,53 @@
; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
; XFAIL: *
; Check that loop unswitch happened and condition hoisted out of the loop.
; Condition is uniform so even targets with divergence should perform unswitching.
; This fails with the new pass manager:
; https://bugs.llvm.org/show_bug.cgi?id=48819
; The correct behaviour (allow uniform non-trivial branches to be
; unswitched on all targets) requires access to the function-level
; divergence analysis from a loop transform, which is currently not
; supported in the new pass manager.
; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
; CHECK: entry:
; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
; CHECK-NEXT: br i1
; Test kernel: a counted loop over [0, %n) that writes the loop index to
; %out[index] whenever %x equals 123456. The equality test lives in the loop
; preheader, so the conditional branch in the loop body does not change across
; iterations; the expected output hoists that condition into the entry block.
define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
entry:
; Bypass the loop when %n <= 0.
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
; Computed once per kernel invocation, not once per iteration.
%cmp1 = icmp eq i32 %x, 123456
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.inc
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.inc, %for.body.lr.ph
; Loop counter: 0 on entry from the preheader, %inc from the latch.
%i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
; Invariant branch: either perform the store or go straight to the latch.
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
; Store the current counter value at the matching offset in %out.
%arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
store i32 %i.07, i32 * %arrayidx, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
; Latch: increment and exit when the incremented counter equals %n.
%inc = add nuw nsw i32 %i.07, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }