//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there
// is at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, so exits reached only through uniform branches are left
// alone; unifying them into the same return block as divergent branches would
// inhibit the use of scalar branching. StructurizeCFG still can't handle the
// case where one branch exits through a return and another through an
// unreachable, so the unreachable is replaced with a return in that case.
//
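// As an illustrative sketch (not taken from an actual test case), given a
// function whose divergent branch exits through two separate return blocks:
//
//     entry --(divergent br)--> retA: ret
//                           `-> retB: ret
//
// the pass rewrites retA and retB to branch to a single new block,
// "UnifiedReturnBlock", which holds the only return. Uniformly reached
// returns are left where they are.
//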
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
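
// The AMDGPU backend is expected to schedule this pass by its ID. An
// illustrative sketch (not quoted from AMDGPUTargetMachine.cpp) of how a
// TargetPassConfig might request it:
//
//   addPass(&AMDGPUUnifyDivergentExitNodesID);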

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  // TODO: Preserve dominator tree.
  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<DivergenceAnalysis>();

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<DivergenceAnalysis>();

  // We preserve the non-critical-edgeness property
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms
  AU.addPreservedID(LowerSwitchID);

  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const DivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack;
  SmallPtrSet<BasicBlock *, 8> Visited;

  for (BasicBlock *Pred : predecessors(&BB))
    Stack.push_back(Pred);
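
  // Walk backwards over every path that can reach BB; any divergent terminator
  // along the way means BB may be reached divergently. A block with no
  // predecessors (such as the entry block) is trivially uniformly reached.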
  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}

static BasicBlock *unifyReturnBlockSet(Function &F,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // Insert a new basic block into the function, add a PHI node (if the
  // function returns values), and convert all of the return instructions
  // into unconditional branches.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
  } else {
    // If the function doesn't return void, add a PHI node to the block...
    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
                         "UnifiedRetVal");
    NewRetBlock->getInstList().push_back(PN);
    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction
    // that is merging into this new block...
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    BB->getInstList().pop_back(); // Remove the return insn
    BranchInst::Create(NewRetBlock, BB);
  }

  for (BasicBlock *BB : ReturningBlocks) {
    // Cleanup possible branch to unconditional branch to the return.
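    // The braced argument builds the SimplifyCFGOptions parameter; its leading
    // field is assumed here to be the bonus-instruction threshold, so {2}
    // requests a threshold of 2.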
    SimplifyCFG(BB, TTI, nullptr, {2});
  }

  return NewRetBlock;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
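
  // The post-dominator tree is rooted at the blocks with no successors, i.e.
  // the function's exits; with at most one exit there is nothing to unify.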
  if (PDT.getRoots().size() <= 1)
    return false;

  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();

  // Loop over all of the blocks in a function, tracking all of the blocks
  // that return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;
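
  // Only divergently reached exits need unifying; exits reached solely through
  // uniform branches are left in place so they can keep using scalar branches.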
  for (BasicBlock *BB : PDT.getRoots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    }
  }
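
  // If there are divergently reached unreachable blocks, first merge them into
  // one block; if the function also has divergently reached returns, that
  // merged block is then turned into one more return (tagged with
  // llvm.amdgcn.unreachable) so only a single kind of divergent exit remains.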
  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      for (BasicBlock *BB : UnreachableBlocks) {
        BB->getInstList().pop_back(); // Remove the unreachable inst.
        BranchInst::Create(UnreachableBlock, BB);
      }
    }

    if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.

      Function *UnreachableIntrin =
          Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code
      // was really reached, but a scalar trap would happen even if no lanes
      // actually reached here.

      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
    }
  }

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return false; // No blocks return

  if (ReturningBlocks.size() == 1)
    return false; // Already has a single return block

  const TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
  return true;
}