[ConstHoisting] Add BFI in constanthoisting pass and select the best insertion

places based on it.

Existing constant hoisting pass will merge a group of contants in a small range
and hoist the const materialization code to the common dominator of their uses.
However, if the uses are all in cold pathes, existing implementation may hoist
the materialization code from cold pathes to a hot place. This may hurt performance.
The patch introduces BFI to the pass and selects the best insertion places based
on it.

The change is controlled by an option consthoist-with-block-frequency which is
off by default for now.

Differential Revision: https://reviews.llvm.org/D28962

llvm-svn: 300989
This commit is contained in:
Wei Mi 2017-04-21 15:50:16 +00:00
parent b42f3c03bf
commit 337d4d95c2
5 changed files with 314 additions and 35 deletions

View File

@ -36,6 +36,7 @@
#ifndef LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H
#define LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
@ -98,7 +99,7 @@ public:
// Glue for old PM.
bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
BasicBlock &Entry);
BlockFrequencyInfo *BFI, BasicBlock &Entry);
void releaseMemory() {
ConstantVec.clear();
@ -112,6 +113,7 @@ private:
const TargetTransformInfo *TTI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
BasicBlock *Entry;
/// Keeps track of constant candidates found in the function.
@ -124,8 +126,8 @@ private:
SmallVector<consthoist::ConstantInfo, 8> ConstantVec;
Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const;
Instruction *findConstantInsertionPoint(
const consthoist::ConstantInfo &ConstInfo) const;
SmallPtrSet<Instruction *, 8>
findConstantInsertionPoint(const consthoist::ConstantInfo &ConstInfo) const;
void collectConstantCandidates(ConstCandMapType &ConstCandMap,
Instruction *Inst, unsigned Idx,
ConstantInt *ConstInt);

View File

@ -53,6 +53,12 @@ using namespace consthoist;
STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
STATISTIC(NumConstantsRebased, "Number of constants rebased");
static cl::opt<bool> ConstHoistWithBlockFrequency(
"consthoist-with-block-frequency", cl::init(false), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to reduce the "
"chance to execute const materialization more frequently than "
"without hoisting."));
namespace {
/// \brief The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
@ -68,6 +74,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
if (ConstHoistWithBlockFrequency)
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@ -82,6 +90,7 @@ private:
char ConstantHoistingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
@ -99,9 +108,13 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
bool MadeChange = Impl.runImpl(
Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(), Fn.getEntryBlock());
bool MadeChange =
Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
ConstHoistWithBlockFrequency
? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
: nullptr,
Fn.getEntryBlock());
if (MadeChange) {
DEBUG(dbgs() << "********** Function after Constant Hoisting: "
@ -148,33 +161,142 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
return IDom->getBlock()->getTerminator();
}
/// \brief Given \p BBs as input, find another set of BBs which collectively
/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
/// set found in \p BBs.
void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
BasicBlock *Entry,
SmallPtrSet<BasicBlock *, 8> &BBs) {
assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
// Nodes on the current path to the root.
SmallPtrSet<BasicBlock *, 8> Path;
// Candidates includes any block 'BB' in set 'BBs' that is not strictly
// dominated by any other blocks in set 'BBs', and all nodes in the path
// in the dominator tree from Entry to 'BB'.
SmallPtrSet<BasicBlock *, 16> Candidates;
for (auto BB : BBs) {
Path.clear();
// Walk up the dominator tree until Entry or another BB in BBs
// is reached. Insert the nodes on the way to the Path.
BasicBlock *Node = BB;
// The "Path" is a candidate path to be added into Candidates set.
bool isCandidate = false;
do {
Path.insert(Node);
if (Node == Entry || Candidates.count(Node)) {
isCandidate = true;
break;
}
assert(DT.getNode(Node)->getIDom() &&
"Entry doens't dominate current Node");
Node = DT.getNode(Node)->getIDom()->getBlock();
} while (!BBs.count(Node));
// If isCandidate is false, Node is another Block in BBs dominating
// current 'BB'. Drop the nodes on the Path.
if (!isCandidate)
continue;
// Add nodes on the Path into Candidates.
Candidates.insert(Path.begin(), Path.end());
}
// Sort the nodes in Candidates in top-down order and save the nodes
// in Orders.
unsigned Idx = 0;
SmallVector<BasicBlock *, 16> Orders;
Orders.push_back(Entry);
while (Idx != Orders.size()) {
BasicBlock *Node = Orders[Idx++];
for (auto ChildDomNode : DT.getNode(Node)->getChildren()) {
if (Candidates.count(ChildDomNode->getBlock()))
Orders.push_back(ChildDomNode->getBlock());
}
}
// Visit Orders in bottom-up order.
typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>
InsertPtsCostPair;
// InsertPtsMap is a map from a BB to the best insertion points for the
// subtree of BB (subtree not including the BB itself).
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
InsertPtsMap.reserve(Orders.size() + 1);
for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
BasicBlock *Node = *RIt;
bool NodeInBBs = BBs.count(Node);
SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first;
BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
// Return the optimal insert points in BBs.
if (Node == Entry) {
BBs.clear();
if (InsertPtsFreq > BFI.getBlockFreq(Node))
BBs.insert(Entry);
else
BBs.insert(InsertPts.begin(), InsertPts.end());
break;
}
BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
// Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
// will update its parent's ParentInsertPts and ParentPtsFreq.
SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first;
BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
// Choose to insert in Node or in subtree of Node.
if (InsertPtsFreq > BFI.getBlockFreq(Node) || NodeInBBs) {
ParentInsertPts.insert(Node);
ParentPtsFreq += BFI.getBlockFreq(Node);
} else {
ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
ParentPtsFreq += InsertPtsFreq;
}
}
}
/// \brief Find an insertion point that dominates all uses.
Instruction *ConstantHoistingPass::findConstantInsertionPoint(
SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
// Collect all basic blocks.
SmallPtrSet<BasicBlock *, 8> BBs;
SmallPtrSet<Instruction *, 8> InsertPts;
for (auto const &RCI : ConstInfo.RebasedConstants)
for (auto const &U : RCI.Uses)
BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
if (BBs.count(Entry))
return &Entry->front();
if (BBs.count(Entry)) {
InsertPts.insert(&Entry->front());
return InsertPts;
}
if (BFI) {
findBestInsertionSet(*DT, *BFI, Entry, BBs);
for (auto BB : BBs) {
BasicBlock::iterator InsertPt = BB->begin();
for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
;
InsertPts.insert(&*InsertPt);
}
return InsertPts;
}
while (BBs.size() >= 2) {
BasicBlock *BB, *BB1, *BB2;
BB1 = *BBs.begin();
BB2 = *std::next(BBs.begin());
BB = DT->findNearestCommonDominator(BB1, BB2);
if (BB == Entry)
return &Entry->front();
if (BB == Entry) {
InsertPts.insert(&Entry->front());
return InsertPts;
}
BBs.erase(BB1);
BBs.erase(BB2);
BBs.insert(BB);
}
assert((BBs.size() == 1) && "Expected only one element.");
Instruction &FirstInst = (*BBs.begin())->front();
return findMatInsertPt(&FirstInst);
InsertPts.insert(findMatInsertPt(&FirstInst));
return InsertPts;
}
@ -557,29 +679,54 @@ bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
for (auto const &ConstInfo : ConstantVec) {
// Hoist and hide the base constant behind a bitcast.
Instruction *IP = findConstantInsertionPoint(ConstInfo);
IntegerType *Ty = ConstInfo.BaseConstant->getType();
Instruction *Base =
new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB "
<< IP->getParent()->getName() << '\n' << *Base << '\n');
SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo);
assert(!IPSet.empty() && "IPSet is empty");
unsigned UsesNum = 0;
unsigned ReBasesNum = 0;
for (Instruction *IP : IPSet) {
IntegerType *Ty = ConstInfo.BaseConstant->getType();
Instruction *Base =
new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
<< ") to BB " << IP->getParent()->getName() << '\n'
<< *Base << '\n');
// Emit materialization code for all rebased constants.
unsigned Uses = 0;
for (auto const &RCI : ConstInfo.RebasedConstants) {
for (auto const &U : RCI.Uses) {
Uses++;
BasicBlock *OrigMatInsertBB =
findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
// If Base constant is to be inserted in multiple places,
// generate rebase for U using the Base dominating U.
if (IPSet.size() == 1 ||
DT->dominates(Base->getParent(), OrigMatInsertBB)) {
emitBaseConstants(Base, RCI.Offset, U);
ReBasesNum++;
}
}
}
UsesNum = Uses;
// Use the same debug location as the last user of the constant.
assert(!Base->use_empty() && "The use list is empty!?");
assert(isa<Instruction>(Base->user_back()) &&
"All uses should be instructions.");
Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
}
(void)UsesNum;
(void)ReBasesNum;
// Expect all uses are rebased after rebase is done.
assert(UsesNum == ReBasesNum && "Not all uses are rebased");
NumConstantsHoisted++;
// Emit materialization code for all rebased constants.
for (auto const &RCI : ConstInfo.RebasedConstants) {
NumConstantsRebased++;
for (auto const &U : RCI.Uses)
emitBaseConstants(Base, RCI.Offset, U);
}
// Base constant is also included in ConstInfo.RebasedConstants, so
// deduct 1 from ConstInfo.RebasedConstants.size().
NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1;
// Use the same debug location as the last user of the constant.
assert(!Base->use_empty() && "The use list is empty!?");
assert(isa<Instruction>(Base->user_back()) &&
"All uses should be instructions.");
Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
// Correct for base constant, which we counted above too.
NumConstantsRebased--;
MadeChange = true;
}
return MadeChange;
@ -595,9 +742,11 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
/// \brief Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
DominatorTree &DT, BasicBlock &Entry) {
DominatorTree &DT, BlockFrequencyInfo *BFI,
BasicBlock &Entry) {
this->TTI = &TTI;
this->DT = &DT;
this->BFI = BFI;
this->Entry = &Entry;
// Collect all constant candidates.
collectConstantCandidates(Fn);
@ -628,7 +777,10 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
if (!runImpl(F, TTI, DT, F.getEntryBlock()))
auto BFI = ConstHoistWithBlockFrequency
? &AM.getResult<BlockFrequencyAnalysis>(F)
: nullptr;
if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
return PreservedAnalyses::all();
PreservedAnalyses PA;

View File

@ -0,0 +1,115 @@
; RUN: opt -consthoist -mtriple=x86_64-unknown-linux-gnu -consthoist-with-block-frequency=true -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Check when BFI is enabled for constant hoisting, constant 214748364701
; will not be hoisted to the func entry.
; CHECK-LABEL: @foo(
; CHECK: entry:
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: if.then:
; Function Attrs: norecurse nounwind uwtable
define i64 @foo(i64* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 9
%t0 = load i64, i64* %arrayidx, align 8
%cmp = icmp slt i64 %t0, 564
br i1 %cmp, label %if.then, label %if.else5
if.then: ; preds = %entry
%arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5
%t1 = load i64, i64* %arrayidx1, align 8
%cmp2 = icmp slt i64 %t1, 1009
br i1 %cmp2, label %if.then3, label %return
if.then3: ; preds = %if.then
%arrayidx4 = getelementptr inbounds i64, i64* %a, i64 6
%t2 = load i64, i64* %arrayidx4, align 8
%inc = add nsw i64 %t2, 1
store i64 %inc, i64* %arrayidx4, align 8
br label %return
if.else5: ; preds = %entry
%arrayidx6 = getelementptr inbounds i64, i64* %a, i64 6
%t3 = load i64, i64* %arrayidx6, align 8
%cmp7 = icmp slt i64 %t3, 3512
br i1 %cmp7, label %if.then8, label %return
if.then8: ; preds = %if.else5
%arrayidx9 = getelementptr inbounds i64, i64* %a, i64 7
%t4 = load i64, i64* %arrayidx9, align 8
%inc10 = add nsw i64 %t4, 1
store i64 %inc10, i64* %arrayidx9, align 8
br label %return
return: ; preds = %if.else5, %if.then, %if.then8, %if.then3
%retval.0 = phi i64 [ 214748364701, %if.then3 ], [ 214748364701, %if.then8 ], [ 250148364702, %if.then ], [ 256148364704, %if.else5 ]
ret i64 %retval.0
}
; Check when BFI is enabled for constant hoisting, constant 214748364701
; in while.body will be hoisted to while.body.preheader. 214748364701 in
; if.then16 and if.else10 will be merged and hoisted to the beginning of
; if.else10 because if.else10 dominates if.then16.
; CHECK-LABEL: @goo(
; CHECK: entry:
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: while.body.preheader:
; CHECK-NEXT: bitcast i64 214748364701 to i64
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: if.else10:
; CHECK-NEXT: bitcast i64 214748364701 to i64
; CHECK-NOT: bitcast i64 214748364701 to i64
define i64 @goo(i64* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 9
%t0 = load i64, i64* %arrayidx, align 8
%cmp = icmp ult i64 %t0, 56
br i1 %cmp, label %if.then, label %if.else10, !prof !0
if.then: ; preds = %entry
%arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5
%t1 = load i64, i64* %arrayidx1, align 8
%cmp2 = icmp ult i64 %t1, 10
br i1 %cmp2, label %while.cond.preheader, label %return, !prof !0
while.cond.preheader: ; preds = %if.then
%arrayidx7 = getelementptr inbounds i64, i64* %a, i64 6
%t2 = load i64, i64* %arrayidx7, align 8
%cmp823 = icmp ugt i64 %t2, 10000
br i1 %cmp823, label %while.body.preheader, label %return
while.body.preheader: ; preds = %while.cond.preheader
br label %while.body
while.body: ; preds = %while.body.preheader, %while.body
%t3 = phi i64 [ %add, %while.body ], [ %t2, %while.body.preheader ]
%add = add i64 %t3, 214748364701
%cmp8 = icmp ugt i64 %add, 10000
br i1 %cmp8, label %while.body, label %while.cond.return.loopexit_crit_edge
if.else10: ; preds = %entry
%arrayidx11 = getelementptr inbounds i64, i64* %a, i64 6
%t4 = load i64, i64* %arrayidx11, align 8
%add2 = add i64 %t4, 214748364701
%cmp12 = icmp ult i64 %add2, 35
br i1 %cmp12, label %if.then16, label %return, !prof !0
if.then16: ; preds = %if.else10
%arrayidx17 = getelementptr inbounds i64, i64* %a, i64 7
%t5 = load i64, i64* %arrayidx17, align 8
%inc = add i64 %t5, 1
store i64 %inc, i64* %arrayidx17, align 8
br label %return
while.cond.return.loopexit_crit_edge: ; preds = %while.body
store i64 %add, i64* %arrayidx7, align 8
br label %return
return: ; preds = %while.cond.preheader, %while.cond.return.loopexit_crit_edge, %if.else10, %if.then, %if.then16
%retval.0 = phi i64 [ 214748364701, %if.then16 ], [ 0, %if.then ], [ 0, %if.else10 ], [ 0, %while.cond.return.loopexit_crit_edge ], [ 0, %while.cond.preheader ]
ret i64 %retval.0
}
!0 = !{!"branch_weights", i32 1, i32 2000}

View File

@ -7,7 +7,6 @@ target triple = "i386--netbsd"
; CHECK-LABEL: fn1
; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: imull {{.*#+}} 4-byte Folded Reload
; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: retl

View File

@ -1,4 +1,5 @@
; RUN: opt -S -consthoist < %s | FileCheck %s
; RUN: opt -S -consthoist -consthoist-with-block-frequency=true < %s | FileCheck --check-prefix=BFIHOIST %s
; FIXME: The catchpad doesn't even use the constant, so a better fix would be to
; insert the bitcast in the catchpad block.
@ -11,6 +12,16 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK-NEXT: bitcast i64 9209618997431186100 to i64
; CHECK-NEXT: br i1 %tobool
; BFIHOIST-LABEL: define i32 @main
; BFIHOIST: then:
; BFIHOIST: %[[CONST1:.*]] = bitcast i64 9209618997431186100 to i64
; BFIHOIST: %add = add i64 %call4, %[[CONST1]]
; BFIHOIST: br label %endif
; BFIHOIST: else:
; BFIHOIST: %[[CONST2:.*]] = bitcast i64 9209618997431186100 to i64
; BFIHOIST: %add6 = add i64 %call5, %[[CONST2]]
; BFIHOIST: br label %endif
; Function Attrs: norecurse
define i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
%call = tail call i64 @fn(i64 0)