llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp

1738 lines
56 KiB
C++

//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
#include <deque>
#include <iterator>
#include <map>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "structcfg"
#define DEFAULT_VEC_SLOTS 8
// TODO: move-begin.
//===----------------------------------------------------------------------===//
//
// Statistics for CFGStructurizer.
//
//===----------------------------------------------------------------------===//
STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
"matched");
STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
"matched");
STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
namespace llvm {
void initializeAMDGPUCFGStructurizerPass(PassRegistry &);
} // end namespace llvm
namespace {
//===----------------------------------------------------------------------===//
//
// Miscellaneous utility for CFGStructurizer.
//
//===----------------------------------------------------------------------===//
#define SHOWNEWINSTR(i) \
DEBUG(dbgs() << "New instr: " << *i << "\n");
#define SHOWNEWBLK(b, msg) \
DEBUG( \
dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
dbgs() << "\n"; \
);
#define SHOWBLK_DETAIL(b, msg) \
DEBUG( \
if (b) { \
dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
b->print(dbgs()); \
dbgs() << "\n"; \
} \
);
#define INVALIDSCCNUM -1
//===----------------------------------------------------------------------===//
//
// supporting data structure for CFGStructurizer
//
//===----------------------------------------------------------------------===//
class BlockInformation {
public:
bool IsRetired = false;
int SccNum = INVALIDSCCNUM;
BlockInformation() = default;
};
//===----------------------------------------------------------------------===//
//
// CFGStructurizer
//
//===----------------------------------------------------------------------===//
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
using MBBVector = SmallVector<MachineBasicBlock *, 32>;
using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>;
using LoopLandInfoMap = std::map<MachineLoop *, MachineBasicBlock *>;
enum PathToKind {
Not_SinglePath = 0,
SinglePath_InPath = 1,
SinglePath_NotInPath = 2
};
static char ID;
AMDGPUCFGStructurizer() : MachineFunctionPass(ID) {
initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override {
return "AMDGPU Control Flow Graph structurizer Pass";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
/// Perform the CFG structurization
bool run();
/// Perform the CFG preparation
/// This step will remove every unconditionnal/dead jump instructions and make
/// sure all loops have an exit block
bool prepare();
bool runOnMachineFunction(MachineFunction &MF) override {
TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
DEBUG(MF.dump(););
OrderedBlks.clear();
Visited.clear();
FuncRep = &MF;
MLI = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
DEBUG(PDT->print(dbgs()););
prepare();
run();
DEBUG(MF.dump(););
return true;
}
protected:
MachineDominatorTree *MDT;
MachinePostDominatorTree *PDT;
MachineLoopInfo *MLI;
const R600InstrInfo *TII = nullptr;
const R600RegisterInfo *TRI = nullptr;
// PRINT FUNCTIONS
/// Print the ordered Blocks.
void printOrderedBlocks() const {
size_t i = 0;
for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
dbgs() << "BB" << (*iterBlk)->getNumber();
dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
if (i != 0 && i % 10 == 0) {
dbgs() << "\n";
} else {
dbgs() << " ";
}
}
}
static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
for (MachineLoop::iterator iter = LoopInfo.begin(),
iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
(*iter)->print(dbgs(), 0);
}
}
// UTILITY FUNCTIONS
int getSCCNum(MachineBasicBlock *MBB) const;
MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
bool hasBackEdge(MachineBasicBlock *MBB) const;
bool isRetiredBlock(MachineBasicBlock *MBB) const;
bool isActiveLoophead(MachineBasicBlock *MBB) const;
PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
bool AllowSideEntry = true) const;
int countActiveBlock(MBBVector::const_iterator It,
MBBVector::const_iterator E) const;
bool needMigrateBlock(MachineBasicBlock *MBB) const;
// Utility Functions
void reversePredicateSetter(MachineBasicBlock::iterator I,
MachineBasicBlock &MBB);
/// Compute the reversed DFS post order of Blocks
void orderBlocks(MachineFunction *MF);
// Function originally from CFGStructTraits
void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
const DebugLoc &DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
const DebugLoc &DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
const DebugLoc &DL);
void insertCondBranchBefore(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, int NewOpcode,
int RegNum, const DebugLoc &DL);
static int getBranchNzeroOpcode(int OldOpcode);
static int getBranchZeroOpcode(int OldOpcode);
static int getContinueNzeroOpcode(int OldOpcode);
static int getContinueZeroOpcode(int OldOpcode);
static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
MachineInstr *MI);
static bool isCondBranch(MachineInstr *MI);
static bool isUncondBranch(MachineInstr *MI);
static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
/// The correct naming for this is getPossibleLoopendBlockBranchInstr.
///
/// BB with backward-edge could have move instructions after the branch
/// instruction. Such move instruction "belong to" the loop backward-edge.
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
static bool isReturnBlock(MachineBasicBlock *MBB);
static void cloneSuccessorList(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB);
static MachineBasicBlock *clone(MachineBasicBlock *MBB);
/// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
/// because the AMDGPU instruction is not recognized as terminator fix this
/// and retire this routine
void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
static void wrapup(MachineBasicBlock *MBB);
int patternMatch(MachineBasicBlock *MBB);
int patternMatchGroup(MachineBasicBlock *MBB);
int serialPatternMatch(MachineBasicBlock *MBB);
int ifPatternMatch(MachineBasicBlock *MBB);
int loopendPatternMatch();
int mergeLoop(MachineLoop *LoopRep);
/// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
/// the same loop with LoopLandInfo without explicitly keeping track of
/// loopContBlks and loopBreakBlks, this is a method to get the information.
bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
MachineBasicBlock *Src2MBB);
int handleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock **LandMBBPtr);
void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock *LandMBB, bool Detail = false);
int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
void mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB);
void mergeIfthenelseBlock(MachineInstr *BranchMI,
MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
void mergeLooplandBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *LandMBB);
void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB);
void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB);
/// normalizeInfiniteLoopExit change
/// B1:
/// uncond_br LoopHeader
///
/// to
/// B1:
/// cond_br 1 LoopHeader dummyExit
/// and return the newly added dummy exit block
MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
void removeUnconditionalBranch(MachineBasicBlock *MBB);
/// Remove duplicate branches instructions in a block.
/// For instance
/// B0:
/// cond_br X B1 B2
/// cond_br X B1 B2
/// is transformed to
/// B0:
/// cond_br X B1 B2
void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
void removeSuccessor(MachineBasicBlock *MBB);
MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
MachineBasicBlock *PredMBB);
void migrateInstruction(MachineBasicBlock *SrcMBB,
MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
void retireBlock(MachineBasicBlock *MBB);
private:
MBBInfoMap BlockInfoMap;
LoopLandInfoMap LLInfoMap;
std::map<MachineLoop *, bool> Visited;
MachineFunction *FuncRep;
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};
} // end anonymous namespace
char AMDGPUCFGStructurizer::ID = 0;
int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
return INVALIDSCCNUM;
return (*It).second->SccNum;
}
MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
const {
LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
if (It == LLInfoMap.end())
return nullptr;
return (*It).second;
}
bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
MachineLoop *LoopRep = MLI->getLoopFor(MBB);
if (!LoopRep)
return false;
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
return MBB->isSuccessor(LoopHeader);
}
bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
return false;
return (*It).second->IsRetired;
}
bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
MachineLoop *LoopRep = MLI->getLoopFor(MBB);
while (LoopRep && LoopRep->getHeader() == MBB) {
MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
if(!LoopLand)
return true;
if (!isRetiredBlock(LoopLand))
return true;
LoopRep = LoopRep->getParentLoop();
}
return false;
}
AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
bool AllowSideEntry) const {
assert(DstMBB);
if (SrcMBB == DstMBB)
return SinglePath_InPath;
while (SrcMBB && SrcMBB->succ_size() == 1) {
SrcMBB = *SrcMBB->succ_begin();
if (SrcMBB == DstMBB)
return SinglePath_InPath;
if (!AllowSideEntry && SrcMBB->pred_size() > 1)
return Not_SinglePath;
}
if (SrcMBB && SrcMBB->succ_size()==0)
return SinglePath_NotInPath;
return Not_SinglePath;
}
int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
MBBVector::const_iterator E) const {
int Count = 0;
while (It != E) {
if (!isRetiredBlock(*It))
++Count;
++It;
}
return Count;
}
bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
unsigned BlockSizeThreshold = 30;
unsigned CloneInstrThreshold = 100;
bool MultiplePreds = MBB && (MBB->pred_size() > 1);
if(!MultiplePreds)
return false;
unsigned BlkSize = MBB->size();
return ((BlkSize > BlockSizeThreshold) &&
(BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
}
void AMDGPUCFGStructurizer::reversePredicateSetter(
MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
assert(I.isValid() && "Expected valid iterator");
for (;; --I) {
if (I == MBB.end())
continue;
if (I->getOpcode() == AMDGPU::PRED_X) {
switch (I->getOperand(2).getImm()) {
case AMDGPU::PRED_SETE_INT:
I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
return;
case AMDGPU::PRED_SETNE_INT:
I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
return;
case AMDGPU::PRED_SETE:
I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
return;
case AMDGPU::PRED_SETNE:
I->getOperand(2).setImm(AMDGPU::PRED_SETE);
return;
default:
llvm_unreachable("PRED_X Opcode invalid!");
}
}
}
}
void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
int NewOpcode, const DebugLoc &DL) {
MachineInstr *MI =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
MBB->push_back(MI);
//assume the instruction doesn't take any reg operand ...
SHOWNEWINSTR(MI);
}
MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
int NewOpcode,
const DebugLoc &DL) {
MachineInstr *MI =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
if (MBB->begin() != MBB->end())
MBB->insert(MBB->begin(), MI);
else
MBB->push_back(MI);
SHOWNEWINSTR(MI);
return MI;
}
MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
MachineBasicBlock::iterator I, int NewOpcode) {
MachineInstr *OldMI = &(*I);
MachineBasicBlock *MBB = OldMI->getParent();
MachineInstr *NewMBB =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
MBB->insert(I, NewMBB);
//assume the instruction doesn't take any reg operand ...
SHOWNEWINSTR(NewMBB);
return NewMBB;
}
void AMDGPUCFGStructurizer::insertCondBranchBefore(
MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
MachineInstr *OldMI = &(*I);
MachineBasicBlock *MBB = OldMI->getParent();
MachineFunction *MF = MBB->getParent();
MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
MBB->insert(I, NewMI);
MachineInstrBuilder MIB(*MF, NewMI);
MIB.addReg(OldMI->getOperand(1).getReg(), false);
SHOWNEWINSTR(NewMI);
//erase later oldInstr->eraseFromParent();
}
void AMDGPUCFGStructurizer::insertCondBranchBefore(
MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
int RegNum, const DebugLoc &DL) {
MachineFunction *MF = blk->getParent();
MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
//insert before
blk->insert(I, NewInstr);
MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
SHOWNEWINSTR(NewInstr);
}
int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
case AMDGPU::BRANCH_COND_i32:
case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
}
int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
case AMDGPU::BRANCH_COND_i32:
case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
default: llvm_unreachable("internal error");
}
return -1;
}
int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
}
int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
switch(OldOpcode) {
case AMDGPU::JUMP_COND:
case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
default: llvm_unreachable("internal error");
}
return -1;
}
MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
return MI->getOperand(0).getMBB();
}
void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
MachineBasicBlock *MBB) {
MI->getOperand(0).setMBB(MBB);
}
MachineBasicBlock *
AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
MachineInstr *MI) {
assert(MBB->succ_size() == 2);
MachineBasicBlock *TrueBranch = getTrueBranch(MI);
MachineBasicBlock::succ_iterator It = MBB->succ_begin();
MachineBasicBlock::succ_iterator Next = It;
++Next;
return (*It == TrueBranch) ? *Next : *It;
}
bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
case AMDGPU::JUMP_COND:
case AMDGPU::BRANCH_COND_i32:
case AMDGPU::BRANCH_COND_f32: return true;
default:
return false;
}
return false;
}
bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
switch (MI->getOpcode()) {
case AMDGPU::JUMP:
case AMDGPU::BRANCH:
return true;
default:
return false;
}
return false;
}
DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
//get DebugLoc from the first MachineBasicBlock instruction with debug info
DebugLoc DL;
for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
++It) {
MachineInstr *instr = &(*It);
if (instr->getDebugLoc())
DL = instr->getDebugLoc();
}
return DL;
}
MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
MachineInstr *MI = &*It;
if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
return MI;
return nullptr;
}
MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
MachineBasicBlock *MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
It != E; ++It) {
// FIXME: Simplify
MachineInstr *MI = &*It;
if (MI) {
if (isCondBranch(MI) || isUncondBranch(MI))
return MI;
else if (!TII->isMov(MI->getOpcode()))
break;
}
}
return nullptr;
}
MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
MachineBasicBlock::reverse_iterator It = MBB->rbegin();
if (It != MBB->rend()) {
MachineInstr *instr = &(*It);
if (instr->getOpcode() == AMDGPU::RETURN)
return instr;
}
return nullptr;
}
bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
MachineInstr *MI = getReturnInstr(MBB);
bool IsReturn = (MBB->succ_size() == 0);
if (MI)
assert(IsReturn);
else if (IsReturn)
DEBUG(
dbgs() << "BB" << MBB->getNumber()
<<" is return block without RETURN instr\n";);
return IsReturn;
}
void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of
}
MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
MachineFunction *Func = MBB->getParent();
MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
Func->push_back(NewMBB); //insert to function
for (const MachineInstr &It : *MBB)
NewMBB->push_back(Func->CloneMachineInstr(&It));
return NewMBB;
}
void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
MachineBasicBlock *NewBlk) {
MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
if (BranchMI && isCondBranch(BranchMI) &&
getTrueBranch(BranchMI) == OldMBB)
setTrueBranch(BranchMI, NewBlk);
}
void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
assert((!MBB->getParent()->getJumpTableInfo()
|| MBB->getParent()->getJumpTableInfo()->isEmpty())
&& "found a jump table");
//collect continue right before endloop
SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
MachineBasicBlock::iterator Pre = MBB->begin();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator It = Pre;
while (It != E) {
if (Pre->getOpcode() == AMDGPU::CONTINUE
&& It->getOpcode() == AMDGPU::ENDLOOP)
ContInstr.push_back(&*Pre);
Pre = It;
++It;
}
//delete continue right before endloop
for (unsigned i = 0; i < ContInstr.size(); ++i)
ContInstr[i]->eraseFromParent();
// TODO to fix up jump table so later phase won't be confused. if
// (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
// there isn't such an interface yet. alternatively, replace all the other
// blocks in the jump table with the entryBlk //}
}
bool AMDGPUCFGStructurizer::prepare() {
bool Changed = false;
//FIXME: if not reducible flow graph, make it so ???
DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
orderBlocks(FuncRep);
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
// Add an ExitBlk to loop that don't have one
for (MachineLoopInfo::iterator It = MLI->begin(),
E = MLI->end(); It != E; ++It) {
MachineLoop *LoopRep = (*It);
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
if (ExitingMBBs.size() == 0) {
MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
if (DummyExitBlk)
RetBlks.push_back(DummyExitBlk);
}
}
// Remove unconditional branch instr.
// Add dummy exit block iff there are multiple returns.
for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
MachineBasicBlock *MBB = *It;
removeUnconditionalBranch(MBB);
removeRedundantConditionalBranch(MBB);
if (isReturnBlock(MBB)) {
RetBlks.push_back(MBB);
}
assert(MBB->succ_size() <= 2);
}
if (RetBlks.size() >= 2) {
addDummyExitBlock(RetBlks);
Changed = true;
}
return Changed;
}
bool AMDGPUCFGStructurizer::run() {
//Assume reducible CFG...
DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
#ifdef STRESSTEST
//Use the worse block ordering to test the algorithm.
ReverseVector(orderedBlks);
#endif
DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
int NumIter = 0;
bool Finish = false;
MachineBasicBlock *MBB;
bool MakeProgress = false;
int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
OrderedBlks.end());
do {
++NumIter;
DEBUG(
dbgs() << "numIter = " << NumIter
<< ", numRemaintedBlk = " << NumRemainedBlk << "\n";
);
SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
OrderedBlks.begin();
SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
OrderedBlks.end();
SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
It;
MachineBasicBlock *SccBeginMBB = nullptr;
int SccNumBlk = 0; // The number of active blocks, init to a
// maximum possible number.
int SccNumIter; // Number of iteration in this SCC.
while (It != E) {
MBB = *It;
if (!SccBeginMBB) {
SccBeginIter = It;
SccBeginMBB = MBB;
SccNumIter = 0;
SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
DEBUG(
dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
dbgs() << "\n";
);
}
if (!isRetiredBlock(MBB))
patternMatch(MBB);
++It;
bool ContNextScc = true;
if (It == E
|| getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
// Just finish one scc.
++SccNumIter;
int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
DEBUG(
dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
<< ", sccNumIter = " << SccNumIter;
dbgs() << "doesn't make any progress\n";
);
ContNextScc = true;
} else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
SccNumBlk = sccRemainedNumBlk;
It = SccBeginIter;
ContNextScc = false;
DEBUG(
dbgs() << "repeat processing SCC" << getSCCNum(MBB)
<< "sccNumIter = " << SccNumIter << '\n';
);
} else {
// Finish the current scc.
ContNextScc = true;
}
} else {
// Continue on next component in the current scc.
ContNextScc = false;
}
if (ContNextScc)
SccBeginMBB = nullptr;
} //while, "one iteration" over the function.
MachineBasicBlock *EntryMBB =
*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
DEBUG(
dbgs() << "Reduce to one block\n";
);
} else {
int NewnumRemainedBlk
= countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
// consider cloned blocks ??
if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
MakeProgress = true;
NumRemainedBlk = NewnumRemainedBlk;
} else {
MakeProgress = false;
DEBUG(
dbgs() << "No progress\n";
);
}
}
} while (!Finish && MakeProgress);
// Misc wrap up to maintain the consistency of the Function representation.
wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
// Detach retired Block, release memory.
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
It != E; ++It) {
if ((*It).second && (*It).second->IsRetired) {
assert(((*It).first)->getNumber() != -1);
DEBUG(
dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
);
(*It).first->eraseFromParent(); //Remove from the parent Function.
}
delete (*It).second;
}
BlockInfoMap.clear();
LLInfoMap.clear();
if (!Finish) {
DEBUG(FuncRep->viewCFG());
report_fatal_error("IRREDUCIBLE_CFG");
}
return true;
}
void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
int SccNum = 0;
MachineBasicBlock *MBB;
for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
++It, ++SccNum) {
const std::vector<MachineBasicBlock *> &SccNext = *It;
for (std::vector<MachineBasicBlock *>::const_iterator
blockIter = SccNext.begin(), blockEnd = SccNext.end();
blockIter != blockEnd; ++blockIter) {
MBB = *blockIter;
OrderedBlks.push_back(MBB);
recordSccnum(MBB, SccNum);
}
}
// walk through all the block in func to check for unreachable
for (auto *MBB : nodes(MF)) {
SccNum = getSCCNum(MBB);
if (SccNum == INVALIDSCCNUM)
dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
}
}
int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
int NumMatch = 0;
int CurMatch;
DEBUG(
dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
);
while ((CurMatch = patternMatchGroup(MBB)) > 0)
NumMatch += CurMatch;
DEBUG(
dbgs() << "End patternMatch BB" << MBB->getNumber()
<< ", numMatch = " << NumMatch << "\n";
);
return NumMatch;
}
int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
int NumMatch = 0;
NumMatch += loopendPatternMatch();
NumMatch += serialPatternMatch(MBB);
NumMatch += ifPatternMatch(MBB);
return NumMatch;
}
int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
if (MBB->succ_size() != 1)
return 0;
MachineBasicBlock *childBlk = *MBB->succ_begin();
if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
return 0;
mergeSerialBlock(MBB, childBlk);
++numSerialPatternMatch;
return 1;
}
int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
//two edges
if (MBB->succ_size() != 2)
return 0;
if (hasBackEdge(MBB))
return 0;
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
if (!BranchMI)
return 0;
assert(isCondBranch(BranchMI));
int NumMatch = 0;
MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
NumMatch += serialPatternMatch(TrueMBB);
NumMatch += ifPatternMatch(TrueMBB);
MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
NumMatch += serialPatternMatch(FalseMBB);
NumMatch += ifPatternMatch(FalseMBB);
MachineBasicBlock *LandBlk;
int Cloned = 0;
assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
// TODO: Simplify
if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
&& *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
// Diamond pattern
LandBlk = *TrueMBB->succ_begin();
} else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
// Triangle pattern, false is empty
LandBlk = FalseMBB;
FalseMBB = nullptr;
} else if (FalseMBB->succ_size() == 1
&& *FalseMBB->succ_begin() == TrueMBB) {
// Triangle pattern, true is empty
// We reverse the predicate to make a triangle, empty false pattern;
std::swap(TrueMBB, FalseMBB);
reversePredicateSetter(MBB->end(), *MBB);
LandBlk = FalseMBB;
FalseMBB = nullptr;
} else if (FalseMBB->succ_size() == 1
&& isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
LandBlk = *FalseMBB->succ_begin();
} else if (TrueMBB->succ_size() == 1
&& isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
LandBlk = *TrueMBB->succ_begin();
} else {
return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
}
// improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
// new BB created for landBlk==NULL may introduce new challenge to the
// reduction process.
if (LandBlk &&
((TrueMBB && TrueMBB->pred_size() > 1)
|| (FalseMBB && FalseMBB->pred_size() > 1))) {
Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
}
if (TrueMBB && TrueMBB->pred_size() > 1) {
TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
++Cloned;
}
if (FalseMBB && FalseMBB->pred_size() > 1) {
FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
++Cloned;
}
mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
++numIfPatternMatch;
numClonedBlock += Cloned;
return 1 + Cloned + NumMatch;
}
int AMDGPUCFGStructurizer::loopendPatternMatch() {
std::deque<MachineLoop *> NestedLoops;
for (auto &It: *MLI)
for (MachineLoop *ML : depth_first(It))
NestedLoops.push_front(ML);
if (NestedLoops.empty())
return 0;
// Process nested loop outside->inside (we did push_front),
// so "continue" to a outside loop won't be mistaken as "break"
// of the current loop.
int Num = 0;
for (MachineLoop *ExaminedLoop : NestedLoops) {
if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
continue;
DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
int NumBreak = mergeLoop(ExaminedLoop);
if (NumBreak == -1)
break;
Num += NumBreak;
}
return Num;
}
int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
MBBVector ExitingMBBs;
LoopRep->getExitingBlocks(ExitingMBBs);
assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
// We assume a single ExitBlk
MBBVector ExitBlks;
LoopRep->getExitBlocks(ExitBlks);
SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
ExitBlkSet.insert(ExitBlks[i]);
assert(ExitBlkSet.size() == 1);
MachineBasicBlock *ExitBlk = *ExitBlks.begin();
assert(ExitBlk && "Loop has several exit block");
MBBVector LatchBlks;
for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader))
if (LoopRep->contains(LB))
LatchBlks.push_back(LB);
for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
settleLoopcontBlock(LatchBlks[i], LoopHeader);
int Match = 0;
do {
Match = 0;
Match += serialPatternMatch(LoopHeader);
Match += ifPatternMatch(LoopHeader);
} while (Match > 0);
mergeLooplandBlock(LoopHeader, ExitBlk);
MachineLoop *ParentLoop = LoopRep->getParentLoop();
if (ParentLoop)
MLI->changeLoopFor(LoopHeader, ParentLoop);
else
MLI->removeBlock(LoopHeader);
Visited[LoopRep] = true;
return 1;
}
bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
if (Src1MBB->succ_size() == 0) {
MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
if (TheEntry) {
DEBUG(
dbgs() << "isLoopContBreakBlock yes src1 = BB"
<< Src1MBB->getNumber()
<< " src2 = BB" << Src2MBB->getNumber() << "\n";
);
return true;
}
}
}
return false;
}
int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
if (Num == 0) {
DEBUG(
dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
);
Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
}
return Num;
}
int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
int Num = 0;
MachineBasicBlock *DownBlk;
//trueBlk could be the common post dominator
DownBlk = TrueMBB;
DEBUG(
dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
<< " true = BB" << TrueMBB->getNumber()
<< ", numSucc=" << TrueMBB->succ_size()
<< " false = BB" << FalseMBB->getNumber() << "\n";
);
while (DownBlk) {
DEBUG(
dbgs() << "check down = BB" << DownBlk->getNumber();
);
if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
DEBUG(
dbgs() << " working\n";
);
Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
numClonedBlock += Num;
Num += serialPatternMatch(*HeadMBB->succ_begin());
Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
Num += ifPatternMatch(HeadMBB);
assert(Num > 0);
break;
}
DEBUG(
dbgs() << " not working\n";
);
DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
} // walk down the postDomTree
return Num;
}
#ifndef NDEBUG
void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
dbgs() << "head = BB" << HeadMBB->getNumber()
<< " size = " << HeadMBB->size();
if (Detail) {
dbgs() << "\n";
HeadMBB->print(dbgs());
dbgs() << "\n";
}
if (TrueMBB) {
dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
<< TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
if (Detail) {
dbgs() << "\n";
TrueMBB->print(dbgs());
dbgs() << "\n";
}
}
if (FalseMBB) {
dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
<< FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
if (Detail) {
dbgs() << "\n";
FalseMBB->print(dbgs());
dbgs() << "\n";
}
}
if (LandMBB) {
dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
<< LandMBB->size() << " numPred = " << LandMBB->pred_size();
if (Detail) {
dbgs() << "\n";
LandMBB->print(dbgs());
dbgs() << "\n";
}
}
dbgs() << "\n";
}
#endif
int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock **LandMBBPtr) {
bool MigrateTrue = false;
bool MigrateFalse = false;
MachineBasicBlock *LandBlk = *LandMBBPtr;
assert((!TrueMBB || TrueMBB->succ_size() <= 1)
&& (!FalseMBB || FalseMBB->succ_size() <= 1));
if (TrueMBB == FalseMBB)
return 0;
MigrateTrue = needMigrateBlock(TrueMBB);
MigrateFalse = needMigrateBlock(FalseMBB);
if (!MigrateTrue && !MigrateFalse)
return 0;
// If we need to migrate either trueBlk and falseBlk, migrate the rest that
// have more than one predecessors. without doing this, its predecessor
// rather than headBlk will have undefined value in initReg.
if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
MigrateTrue = true;
if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
MigrateFalse = true;
DEBUG(
dbgs() << "before improveSimpleJumpintoIf: ";
showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
);
// org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
//
// new: headBlk => if () {initReg = 1; org trueBlk branch} else
// {initReg = 0; org falseBlk branch }
// => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
// => org landBlk
// if landBlk->pred_size() > 2, put the about if-else inside
// if (initReg !=2) {...}
//
// add initReg = initVal to headBlk
const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
if (!MigrateTrue || !MigrateFalse) {
// XXX: We have an opportunity here to optimize the "branch into if" case
// here. Branch into if looks like this:
// entry
// / |
// diamond_head branch_from
// / \ |
// diamond_false diamond_true
// \ /
// done
//
// The diamond_head block begins the "if" and the diamond_true block
// is the block being "branched into".
//
// If MigrateTrue is true, then TrueBB is the block being "branched into"
// and if MigrateFalse is true, then FalseBB is the block being
// "branched into"
//
// Here is the pseudo code for how I think the optimization should work:
// 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
// 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
// 3. Move the branch instruction from diamond_head into its own basic
// block (new_block).
// 4. Add an unconditional branch from diamond_head to new_block
// 5. Replace the branch instruction in branch_from with an unconditional
// branch to new_block. If branch_from has multiple predecessors, then
// we need to replace the True/False block in the branch
// instruction instead of replacing it.
// 6. Change the condition of the branch instruction in new_block from
// COND to (COND || GPR0)
//
// In order insert these MOV instruction, we will need to use the
// RegisterScavenger. Usually liveness stops being tracked during
// the late machine optimization passes, however if we implement
// bool TargetRegisterInfo::requiresRegisterScavenging(
// const MachineFunction &MF)
// and have it return true, liveness will be tracked correctly
// by generic optimization passes. We will also need to make sure that
// all of our target-specific passes that run after regalloc and before
// the CFGStructurizer track liveness and we will need to modify this pass
// to correctly track liveness.
//
// After the above changes, the new CFG should look like this:
// entry
// / |
// diamond_head branch_from
// \ /
// new_block
// / |
// diamond_false diamond_true
// \ /
// done
//
// Without this optimization, we are forced to duplicate the diamond_true
// block and we will end up with a CFG like this:
//
// entry
// / |
// diamond_head branch_from
// / \ |
// diamond_false diamond_true diamond_true (duplicate)
// \ / |
// done --------------------|
//
// Duplicating diamond_true can be very costly especially if it has a
// lot of instructions.
return 0;
}
int NumNewBlk = 0;
bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
//insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
if (LandBlkHasOtherPred) {
report_fatal_error("Extra register needed to handle CFG");
unsigned CmpResReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
report_fatal_error("Extra compare instruction needed to handle CFG");
insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
}
// XXX: We are running this after RA, so creating virtual registers will
// cause an assertion failure in the PostRA scheduling pass.
unsigned InitReg =
HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
DebugLoc());
if (MigrateTrue) {
migrateInstruction(TrueMBB, LandBlk, I);
// need to uncondionally insert the assignment to ensure a path from its
// predecessor rather than headBlk has valid value in initReg if
// (initVal != 1).
report_fatal_error("Extra register needed to handle CFG");
}
insertInstrBefore(I, AMDGPU::ELSE);
if (MigrateFalse) {
migrateInstruction(FalseMBB, LandBlk, I);
// need to uncondionally insert the assignment to ensure a path from its
// predecessor rather than headBlk has valid value in initReg if
// (initVal != 0)
report_fatal_error("Extra register needed to handle CFG");
}
if (LandBlkHasOtherPred) {
// add endif
insertInstrBefore(I, AMDGPU::ENDIF);
// put initReg = 2 to other predecessors of landBlk
for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
PE = LandBlk->pred_end(); PI != PE; ++PI) {
MachineBasicBlock *MBB = *PI;
if (MBB != TrueMBB && MBB != FalseMBB)
report_fatal_error("Extra register needed to handle CFG");
}
}
DEBUG(
dbgs() << "result from improveSimpleJumpintoIf: ";
showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
);
// update landBlk
*LandMBBPtr = LandBlk;
return NumNewBlk;
}
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
DEBUG(
dbgs() << "serialPattern BB" << DstMBB->getNumber()
<< " <= BB" << SrcMBB->getNumber() << "\n";
);
DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
DstMBB->removeSuccessor(SrcMBB, true);
cloneSuccessorList(DstMBB, SrcMBB);
removeSuccessor(SrcMBB);
MLI->removeBlock(SrcMBB);
retireBlock(SrcMBB);
}
void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
assert (TrueMBB);
DEBUG(
dbgs() << "ifPattern BB" << MBB->getNumber();
dbgs() << "{ ";
if (TrueMBB) {
dbgs() << "BB" << TrueMBB->getNumber();
}
dbgs() << " } else ";
dbgs() << "{ ";
if (FalseMBB) {
dbgs() << "BB" << FalseMBB->getNumber();
}
dbgs() << " }\n ";
dbgs() << "landBlock: ";
if (!LandMBB) {
dbgs() << "NULL";
} else {
dbgs() << "BB" << LandMBB->getNumber();
}
dbgs() << "\n";
);
int OldOpcode = BranchMI->getOpcode();
DebugLoc BranchDL = BranchMI->getDebugLoc();
// transform to
// if cond
// trueBlk
// else
// falseBlk
// endif
// landBlk
MachineBasicBlock::iterator I = BranchMI;
insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
BranchDL);
if (TrueMBB) {
MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
MBB->removeSuccessor(TrueMBB, true);
if (LandMBB && TrueMBB->succ_size()!=0)
TrueMBB->removeSuccessor(LandMBB, true);
retireBlock(TrueMBB);
MLI->removeBlock(TrueMBB);
}
if (FalseMBB) {
insertInstrBefore(I, AMDGPU::ELSE);
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
MBB->removeSuccessor(FalseMBB, true);
if (LandMBB && FalseMBB->succ_size() != 0)
FalseMBB->removeSuccessor(LandMBB, true);
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
}
insertInstrBefore(I, AMDGPU::ENDIF);
BranchMI->eraseFromParent();
if (LandMBB && TrueMBB && FalseMBB)
MBB->addSuccessor(LandMBB);
}
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
MachineBasicBlock *LandMBB) {
DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
<< " land = BB" << LandMBB->getNumber() << "\n";);
insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
<< " land = BB" << LandMBB->getNumber() << "\n";);
MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
assert(BranchMI && isCondBranch(BranchMI));
DebugLoc DL = BranchMI->getDebugLoc();
MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
MachineBasicBlock::iterator I = BranchMI;
if (TrueBranch != LandMBB)
reversePredicateSetter(I, *I->getParent());
insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
insertInstrBefore(I, AMDGPU::BREAK);
insertInstrBefore(I, AMDGPU::ENDIF);
//now branchInst can be erase safely
BranchMI->eraseFromParent();
//now take care of successors, retire blocks
ExitingMBB->removeSuccessor(LandMBB, true);
}
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
MachineBasicBlock *ContMBB) {
DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
<< ContingMBB->getNumber()
<< ", cont = BB" << ContMBB->getNumber() << "\n";);
MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
if (MI) {
assert(isCondBranch(MI));
MachineBasicBlock::iterator I = MI;
MachineBasicBlock *TrueBranch = getTrueBranch(MI);
int OldOpcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
if (!UseContinueLogical) {
int BranchOpcode =
TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
getBranchZeroOpcode(OldOpcode);
insertCondBranchBefore(I, BranchOpcode, DL);
// insertEnd to ensure phi-moves, if exist, go before the continue-instr.
insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
} else {
int BranchOpcode =
TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
getContinueZeroOpcode(OldOpcode);
insertCondBranchBefore(I, BranchOpcode, DL);
}
MI->eraseFromParent();
} else {
// if we've arrived here then we've already erased the branch instruction
// travel back up the basic block to see the last reference of our debug
// location we've just inserted that reference here so it should be
// representative insertEnd to ensure phi-moves, if exist, go before the
// continue-instr.
insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
getLastDebugLocInBB(ContingMBB));
}
}
int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
int Cloned = 0;
assert(PreMBB->isSuccessor(SrcMBB));
while (SrcMBB && SrcMBB != DstMBB) {
assert(SrcMBB->succ_size() == 1);
if (SrcMBB->pred_size() > 1) {
SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
++Cloned;
}
PreMBB = SrcMBB;
SrcMBB = *SrcMBB->succ_begin();
}
return Cloned;
}
MachineBasicBlock *
AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
MachineBasicBlock *PredMBB) {
assert(PredMBB->isSuccessor(MBB) &&
"succBlk is not a prececessor of curBlk");
MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions
replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
//srcBlk, oldBlk, newBlk
PredMBB->replaceSuccessor(MBB, CloneMBB);
// add all successor to cloneBlk
cloneSuccessorList(CloneMBB, MBB);
numClonedInstr += MBB->size();
DEBUG(
dbgs() << "Cloned block: " << "BB"
<< MBB->getNumber() << "size " << MBB->size() << "\n";
);
SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
return CloneMBB;
}
void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator SpliceEnd;
//look for the input branchinstr, not the AMDGPU branchinstr
MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
if (!BranchMI) {
DEBUG(
dbgs() << "migrateInstruction don't see branch instr\n";
);
SpliceEnd = SrcMBB->end();
} else {
DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
SpliceEnd = BranchMI;
}
DEBUG(
dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
<< "srcSize = " << SrcMBB->size() << "\n";
);
//splice insert before insertPos
DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
DEBUG(
dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
<< "srcSize = " << SrcMBB->size() << '\n';
);
}
MachineBasicBlock *
AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
if (!LoopHeader || !LoopLatch)
return nullptr;
MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
// Is LoopRep an infinite loop ?
if (!BranchMI || !isUncondBranch(BranchMI))
return nullptr;
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
Ctx.emitError("Extra register needed to handle CFG");
return nullptr;
}
void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
MachineInstr *BranchMI;
// I saw two unconditional branch in one basic block in example
// test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
while ((BranchMI = getLoopendBlockBranchInstr(MBB))
&& isUncondBranch(BranchMI)) {
DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
}
}
void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MachineBasicBlock *MBB) {
if (MBB->succ_size() != 2)
return;
MachineBasicBlock *MBB1 = *MBB->succ_begin();
MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
if (MBB1 != MBB2)
return;
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
assert(BranchMI && isCondBranch(BranchMI));
DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
SHOWNEWBLK(MBB1, "Removing redundant successor");
MBB->removeSuccessor(MBB1, true);
}
void AMDGPUCFGStructurizer::addDummyExitBlock(
SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
FuncRep->push_back(DummyExitBlk); //insert to function
insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
E = RetMBB.end(); It != E; ++It) {
MachineBasicBlock *MBB = *It;
MachineInstr *MI = getReturnInstr(MBB);
if (MI)
MI->eraseFromParent();
MBB->addSuccessor(DummyExitBlk);
DEBUG(
dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
<< " successors\n";
);
}
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
}
void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
while (MBB->succ_size())
MBB->removeSuccessor(*MBB->succ_begin());
}
void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
int SccNum) {
BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
if (!srcBlkInfo)
srcBlkInfo = new BlockInformation();
srcBlkInfo->SccNum = SccNum;
}
void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
DEBUG(
dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
);
BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
if (!SrcBlkInfo)
SrcBlkInfo = new BlockInformation();
SrcBlkInfo->IsRetired = true;
assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
&& "can't retire block yet");
}
INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
return new AMDGPUCFGStructurizer();
}