AMDGPU: Add a pass to rewrite certain undef in PHI

For the following IR pattern (where %if terminates with a divergent branch),
divergence analysis will report %phi as uniform to enable optimal code
generation:
```
  %if
  | \
  | %then
  | /
  %endif: %phi = phi [ %uniform, %if ], [ %undef, %then ]
```
In the backend, %phi and %uniform will be assigned a scalar register. But the
%undef incoming from %then makes the scalar register dead in %then, which will
likely cause the register to be overwritten in %then. To fix the issue, we
rewrite the %undef as %uniform. For details, please refer to the comment in
AMDGPURewriteUndefForPHI.cpp. There are currently no test changes shown, but
this is mandatory for later changes.
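
To illustrate with a minimal sketch (the value and block names here are
illustrative only), given a PHI whose only defined incoming value is the
uniform %c:
```
end:
  %c2 = phi float [ %c, %entry ], [ undef, %then ]
  %r = fadd float %v, %c2
```
the pass replaces all uses of %c2 with %c and erases the PHI:
```
end:
  %r = fadd float %v, %c
```
so the scalar register holding %c stays live through %then instead of being
considered dead there.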

Reviewed by: sameerds

Differential Revision: https://reviews.llvm.org/D133840
Ruiling Song 2022-09-13 14:56:09 +08:00
parent 7a8b9307ca
commit cf14c7caac
7 changed files with 361 additions and 0 deletions

@@ -290,6 +290,10 @@ extern char &AMDGPUCodeGenPrepareID;
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareID;
FunctionPass *createAMDGPURewriteUndefForPHIPass();
void initializeAMDGPURewriteUndefForPHIPass(PassRegistry &);
extern char &AMDGPURewriteUndefForPHIPassID;
void initializeSIAnnotateControlFlowPass(PassRegistry&);
extern char &SIAnnotateControlFlowPassID;

@@ -0,0 +1,181 @@
//===- AMDGPURewriteUndefForPHI.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file implements rewriting of undef incoming operands for certain PHIs
// in a structurized CFG. This pass only works on IR that has gone through the
// StructurizeCFG pass, and it has additional limitations that mean it can only
// run after SIAnnotateControlFlow.
//
// To achieve optimal code generation for AMDGPU, we assume that divergence
// analysis reports a PHI in the join block of a divergent branch as uniform if
// it has one unique uniform incoming value plus additional undefined/poisoned
// incoming values. That is to say, the later compiler pipeline must ensure
// that such a PHI always returns a uniform value and works correctly. Let's
// look at two typical patterns in a structurized CFG that need to be taken
// care of (in both patterns, block %if terminates with a divergent branch):
//
// Pattern A: The block with the undefined incoming value dominates the
// predecessor with the defined incoming value:
// %if
// | \
// | %then
// | /
// %endif: %phi = phi [%undef, %if], [%uniform, %then]
//
// Pattern B: The block with the defined incoming value dominates the
// predecessor with the undefined incoming value:
// %if
// | \
// | %then
// | /
// %endif: %phi = phi [%uniform, %if], [%undef, %then]
//
// For pattern A, by reporting %phi as uniform, the later pipeline needs to
// make sure it is handled correctly. The backend usually allocates a scalar
// register, and if any thread in a wave takes the %then path, the scalar
// register will get the %uniform value.
//
// For pattern B, we replace the undef operand with the other defined value in
// this pass, so the scalar register allocated for such a PHI will get the
// correct liveness. Without this transformation, the scalar register may be
// overwritten in the %then block.
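//
// For example (an illustrative sketch of the pattern B rewrite): given
//   %phi = phi i32 [ %uniform, %if ], [ undef, %then ]
// the only defined incoming value is %uniform, so the PHI folds to %uniform;
// the pass implements this by replacing all uses of %phi with %uniform and
// erasing the PHI.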
//
// Limitation note:
// If the join block of divergent threads is a loop header, the pass cannot
// handle it correctly right now. For the case below, the undef in %phi should
// also be rewritten. Currently we depend on SIAnnotateControlFlow to split the
// %header block to get a separate join block, so that we can rewrite the undef
// correctly.
// %if
// | \
// | %then
// | /
// -> %header: %phi = phi [%uniform, %if], [%undef, %then], [%uniform2, %header]
// | |
// \---
#include "AMDGPU.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-rewrite-undef-for-phi"
namespace {
class AMDGPURewriteUndefForPHI : public FunctionPass {
public:
  static char ID;
  AMDGPURewriteUndefForPHI() : FunctionPass(ID) {
    initializeAMDGPURewriteUndefForPHIPass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override;
  StringRef getPassName() const override {
    return "AMDGPU Rewrite Undef for PHI";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<LegacyDivergenceAnalysis>();
    AU.setPreservesCFG();
  }
};
} // end anonymous namespace

char AMDGPURewriteUndefForPHI::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
                      "Rewrite undef for PHI", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
                    "Rewrite undef for PHI", false, false)
bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) {
  bool Changed = false;
  SmallVector<PHINode *> ToBeDeleted;
  for (auto &BB : F) {
    for (auto &PHI : BB.phis()) {
      if (DA->isDivergent(&PHI))
        continue;

      // The unique defined incoming value (excluding undef/poison) of the
      // PHI node.
      Value *UniqueDefinedIncoming = nullptr;
      // The divergent block with a defined incoming value that dominates all
      // other blocks with the same incoming value.
      BasicBlock *DominateBB = nullptr;
      // Predecessors with an undefined incoming value (excluding loop
      // backedges).
      SmallVector<BasicBlock *> Undefs;

      for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
        Value *Incoming = PHI.getIncomingValue(i);
        BasicBlock *IncomingBB = PHI.getIncomingBlock(i);
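        // A PHI may use itself through a loop backedge; such a self-reference
        // does not affect the search for the unique defined incoming value,
        // so skip it.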
        if (Incoming == &PHI)
          continue;

        if (isa<UndefValue>(Incoming)) {
          // Undef from a loop backedge will not be replaced.
          if (!DT->dominates(&BB, IncomingBB))
            Undefs.push_back(IncomingBB);
          continue;
        }

        if (!UniqueDefinedIncoming) {
          UniqueDefinedIncoming = Incoming;
          DominateBB = IncomingBB;
        } else if (Incoming == UniqueDefinedIncoming) {
          // Update DominateBB if necessary.
          if (DT->dominates(IncomingBB, DominateBB))
            DominateBB = IncomingBB;
        } else {
          UniqueDefinedIncoming = nullptr;
          break;
        }
      }

      // We only need to replace the undef for a PHI which is merging
      // defined/undefined values from divergent threads.
      // TODO: We should still be able to replace the undef value if the
      // unique value is a Constant.
      if (!UniqueDefinedIncoming || Undefs.empty() ||
          !DA->isDivergent(DominateBB->getTerminator()))
        continue;

      // We only replace the undef when DominateBB truly dominates all the
      // other predecessors with undefined incoming values. Make sure
      // DominateBB dominates BB so that UniqueDefinedIncoming is available
      // in BB and afterwards.
      if (DT->dominates(DominateBB, &BB) &&
          all_of(Undefs, [&](BasicBlock *UD) {
            return DT->dominates(DominateBB, UD);
          })) {
        PHI.replaceAllUsesWith(UniqueDefinedIncoming);
        ToBeDeleted.push_back(&PHI);
        Changed = true;
      }
    }
  }

  for (auto *PHI : ToBeDeleted)
    PHI->eraseFromParent();

  return Changed;
}
bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) {
  LegacyDivergenceAnalysis *DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  return rewritePHIs(F, DA, DT);
}

FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() {
  return new AMDGPURewriteUndefForPHI();
}

@@ -380,6 +380,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHIPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeAMDGPUReleaseVGPRsPass(*PR);
@@ -1198,6 +1199,10 @@ bool GCNPassConfig::addPreISel() {
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
    // TODO: Move this right after structurizeCFG to avoid extra divergence
    // analysis. This depends on stopping SIAnnotateControlFlow from making
    // control flow modifications.
    addPass(createAMDGPURewriteUndefForPHIPass());
  }
  addPass(createLCSSAPass());

@@ -92,6 +92,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUReplaceLDSUseWithPointer.cpp
  AMDGPUResourceUsageAnalysis.cpp
  AMDGPURewriteOutArguments.cpp
  AMDGPURewriteUndefForPHI.cpp
  AMDGPUSetWavePriority.cpp
  AMDGPUSubtarget.cpp
  AMDGPUTargetMachine.cpp

@@ -83,6 +83,9 @@
; GCN-O0-NEXT: Memory SSA
; GCN-O0-NEXT: AMDGPU Annotate Uniform Values
; GCN-O0-NEXT: SI annotate control flow
; GCN-O0-NEXT: Post-Dominator Tree Construction
; GCN-O0-NEXT: Legacy Divergence Analysis
; GCN-O0-NEXT: AMDGPU Rewrite Undef for PHI
; GCN-O0-NEXT: LCSSA Verifier
; GCN-O0-NEXT: Loop-Closed SSA Form Pass
; GCN-O0-NEXT: DummyCGSCCPass
@@ -264,6 +267,9 @@
; GCN-O1-NEXT: Memory SSA
; GCN-O1-NEXT: AMDGPU Annotate Uniform Values
; GCN-O1-NEXT: SI annotate control flow
; GCN-O1-NEXT: Post-Dominator Tree Construction
; GCN-O1-NEXT: Legacy Divergence Analysis
; GCN-O1-NEXT: AMDGPU Rewrite Undef for PHI
; GCN-O1-NEXT: LCSSA Verifier
; GCN-O1-NEXT: Loop-Closed SSA Form Pass
; GCN-O1-NEXT: DummyCGSCCPass
@@ -548,6 +554,9 @@
; GCN-O1-OPTS-NEXT: Memory SSA
; GCN-O1-OPTS-NEXT: AMDGPU Annotate Uniform Values
; GCN-O1-OPTS-NEXT: SI annotate control flow
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis
; GCN-O1-OPTS-NEXT: AMDGPU Rewrite Undef for PHI
; GCN-O1-OPTS-NEXT: LCSSA Verifier
; GCN-O1-OPTS-NEXT: Loop-Closed SSA Form Pass
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
@@ -840,6 +849,9 @@
; GCN-O2-NEXT: Memory SSA
; GCN-O2-NEXT: AMDGPU Annotate Uniform Values
; GCN-O2-NEXT: SI annotate control flow
; GCN-O2-NEXT: Post-Dominator Tree Construction
; GCN-O2-NEXT: Legacy Divergence Analysis
; GCN-O2-NEXT: AMDGPU Rewrite Undef for PHI
; GCN-O2-NEXT: LCSSA Verifier
; GCN-O2-NEXT: Loop-Closed SSA Form Pass
; GCN-O2-NEXT: Analysis if a function is memory bound
@@ -1147,6 +1159,9 @@
; GCN-O3-NEXT: Memory SSA
; GCN-O3-NEXT: AMDGPU Annotate Uniform Values
; GCN-O3-NEXT: SI annotate control flow
; GCN-O3-NEXT: Post-Dominator Tree Construction
; GCN-O3-NEXT: Legacy Divergence Analysis
; GCN-O3-NEXT: AMDGPU Rewrite Undef for PHI
; GCN-O3-NEXT: LCSSA Verifier
; GCN-O3-NEXT: Loop-Closed SSA Form Pass
; GCN-O3-NEXT: Analysis if a function is memory bound

@@ -0,0 +1,103 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-- -S -amdgpu-rewrite-undef-for-phi %s | FileCheck -check-prefix=OPT %s
define amdgpu_ps float @basic(float inreg %c, i32 %x) #0 {
; OPT-LABEL: @basic(
; OPT-NEXT: entry:
; OPT-NEXT: [[CC:%.*]] = icmp slt i32 [[X:%.*]], 0
; OPT-NEXT: br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]]
; OPT: if:
; OPT-NEXT: br label [[END]]
; OPT: end:
; OPT-NEXT: ret float [[C:%.*]]
;
entry:
  %cc = icmp slt i32 %x, 0
  br i1 %cc, label %if, label %end

if:
  br label %end

end:
  %c2 = phi float [ undef, %if ], [ %c, %entry ]
  ret float %c2
}

define amdgpu_ps float @with_uniform_region_inside(float inreg %c, i32 inreg %d, i32 %x) #0 {
; OPT-LABEL: @with_uniform_region_inside(
; OPT-NEXT: entry:
; OPT-NEXT: [[CC:%.*]] = icmp slt i32 [[X:%.*]], 0
; OPT-NEXT: br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]]
; OPT: if:
; OPT-NEXT: [[CC2:%.*]] = icmp slt i32 [[D:%.*]], 0
; OPT-NEXT: br i1 [[CC2]], label [[BB2:%.*]], label [[BB3:%.*]]
; OPT: bb2:
; OPT-NEXT: br label [[END]]
; OPT: bb3:
; OPT-NEXT: [[CC3:%.*]] = icmp slt i32 [[D]], 2
; OPT-NEXT: br i1 [[CC3]], label [[BB4:%.*]], label [[END]]
; OPT: bb4:
; OPT-NEXT: br label [[END]]
; OPT: end:
; OPT-NEXT: ret float [[C:%.*]]
;
entry:
  %cc = icmp slt i32 %x, 0
  br i1 %cc, label %if, label %end

if:
  %cc2 = icmp slt i32 %d, 0
  br i1 %cc2, label %bb2, label %bb3

bb2:
  br label %end

bb3:
  %cc3 = icmp slt i32 %d, 2
  br i1 %cc3, label %bb4, label %end

bb4:
  br label %end

end:
  %c2 = phi float [ undef, %bb2 ], [ %c, %bb3 ], [ undef, %bb4 ], [ %c, %entry ]
  ret float %c2
}

define amdgpu_ps float @exclude_backedge(float inreg %c, i32 %x) #0 {
; OPT-LABEL: @exclude_backedge(
; OPT-NEXT: entry:
; OPT-NEXT: [[CC:%.*]] = icmp slt i32 [[X:%.*]], 0
; OPT-NEXT: br i1 [[CC]], label [[END:%.*]], label [[LOOP:%.*]]
; OPT: loop:
; OPT-NEXT: [[IND:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
; OPT-NEXT: [[C2:%.*]] = phi float [ [[C:%.*]], [[ENTRY]] ], [ undef, [[LOOP]] ]
; OPT-NEXT: [[INC]] = add i32 [[IND]], 1
; OPT-NEXT: [[LOOP_CC:%.*]] = icmp slt i32 [[INC]], 5
; OPT-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[LOOP_END:%.*]]
; OPT: loop_end:
; OPT-NEXT: br label [[END]]
; OPT: end:
; OPT-NEXT: [[R:%.*]] = phi float [ [[C2]], [[LOOP_END]] ], [ [[C]], [[ENTRY]] ]
; OPT-NEXT: ret float [[R]]
;
entry:
  %cc = icmp slt i32 %x, 0
  br i1 %cc, label %end, label %loop

loop:
  %ind = phi i32 [ 0, %entry ], [ %inc, %loop ]
  %c2 = phi float [ %c, %entry ], [ undef, %loop ]
  %inc = add i32 %ind, 1
  %loop_cc = icmp slt i32 %inc, 5
  br i1 %loop_cc, label %loop, label %loop_end

loop_end:
  br label %end

end:
  %r = phi float [ %c2, %loop_end ], [ %c, %entry ]
  ret float %r
}

attributes #0 = { nounwind noinline }

@@ -0,0 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
;
; This test shows a typical case where a PHI (%c2) in a join block is treated
; as uniform because it has one unique uniform incoming value plus one
; additional undef incoming value. Such a case would suffer from a correctness
; issue if %c2 were assigned a scalar register that is meanwhile dead in %if.
; The problem is solved by replacing the %undef with %c (thus replacing %c2
; with %c in this example).
define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, i32 %y) #0 {
; GCN-LABEL: uniform_phi_with_undef:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_cmp_lt_i32_e64 s2, v2, v1
; GCN-NEXT: s_mov_b32 s1, exec_lo
; GCN-NEXT: s_and_b32 s2, s1, s2
; GCN-NEXT: s_mov_b32 exec_lo, s2
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_mov_b32 s2, 2.0
; GCN-NEXT: v_div_scale_f32 v1, s3, s2, s2, v0
; GCN-NEXT: v_rcp_f32_e64 v2, v1
; GCN-NEXT: s_mov_b32 s3, 1.0
; GCN-NEXT: v_fma_f32 v3, -v1, v2, s3
; GCN-NEXT: v_fmac_f32_e64 v2, v3, v2
; GCN-NEXT: v_div_scale_f32 v3, vcc_lo, v0, s2, v0
; GCN-NEXT: v_mul_f32_e64 v4, v3, v2
; GCN-NEXT: v_fma_f32 v5, -v1, v4, v3
; GCN-NEXT: v_fmac_f32_e64 v4, v5, v2
; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3
; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GCN-NEXT: v_div_fixup_f32 v0, v1, s2, v0
; GCN-NEXT: .LBB0_2: ; %end
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GCN-NEXT: v_add_f32_e64 v0, v0, s0
; GCN-NEXT: ; return to shader part epilog
entry:
  %cc = icmp slt i32 %y, %x
  br i1 %cc, label %if, label %end

if:
  %v.if = fdiv float %v, 2.0
  br label %end

end:
  %v2 = phi float [ %v.if, %if ], [ %v, %entry ]
  %c2 = phi float [ undef, %if ], [ %c, %entry ]
  %r = fadd float %v2, %c2
  ret float %r
}

attributes #0 = { nounwind optnone noinline }