[DeadStoreElimination] Add support for non-local DSE.

We extend the search for redundant stores to predecessor blocks that
unconditionally lead to the block BB with the current store instruction.  That
also includes single-block loops that unconditionally lead to BB, and
if-then-else blocks where then- and else-blocks unconditionally lead to BB.

http://reviews.llvm.org/D13363
Patch by Ivan Baev <ibaev@codeaurora.org>!

llvm-svn: 255247
This commit is contained in:
Chad Rosier 2015-12-10 13:51:43 +00:00
parent ac8d01add0
commit 533bc3fcac
6 changed files with 392 additions and 89 deletions

View File

@ -7,11 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
// This file implements a trivial dead store elimination that only considers
// basic-block local redundant stores.
//
// FIXME: This should eventually be extended to be a post-dominator tree
// traversal. Doing so would be pretty trivial.
// This file implements dead store elimination that considers redundant stores
// within a basic-block as well as across basic blocks in a reverse CFG order.
//
//===----------------------------------------------------------------------===//
@ -44,6 +41,13 @@ using namespace llvm;
STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
STATISTIC(NumFastStores, "Number of stores deleted");
STATISTIC(NumFastOther , "Number of other instrs removed");
STATISTIC(NumNonLocalStores, "Number of non-local stores deleted");
static cl::opt<bool> EnableNonLocalDSE("enable-nonlocal-dse", cl::init(true));
/// MaxNonLocalAttempts is an arbitrary threshold that provides
/// an early opportunitiy for bail out to control compile time.
static const unsigned MaxNonLocalAttempts = 100;
namespace {
struct DSE : public FunctionPass {
@ -80,6 +84,7 @@ namespace {
bool runOnBasicBlock(BasicBlock &BB);
bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI);
bool HandleFree(CallInst *F);
bool handleNonLocalDependency(Instruction *Inst);
bool handleEndBlock(BasicBlock &BB);
void RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
SmallSetVector<Value *, 16> &DeadStackObjects,
@ -485,6 +490,7 @@ static bool isPossibleSelfRead(Instruction *Inst,
bool DSE::runOnBasicBlock(BasicBlock &BB) {
const DataLayout &DL = BB.getModule()->getDataLayout();
bool MadeChange = false;
unsigned NumNonLocalAttempts = 0;
// Do a top-down walk on the BB.
for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
@ -554,99 +560,101 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
MemDepResult InstDep = MD->getDependency(Inst);
// Ignore any store where we can't find a local dependence.
// FIXME: cross-block DSE would be fun. :)
if (!InstDep.isDef() && !InstDep.isClobber())
continue;
if (InstDep.isDef() || InstDep.isClobber()) {
// Figure out what location is being stored to.
MemoryLocation Loc = getLocForWrite(Inst, *AA);
// Figure out what location is being stored to.
MemoryLocation Loc = getLocForWrite(Inst, *AA);
// If we didn't get a useful location, fail.
if (!Loc.Ptr)
continue;
// If we didn't get a useful location, fail.
if (!Loc.Ptr)
continue;
while (InstDep.isDef() || InstDep.isClobber()) {
// Get the memory clobbered by the instruction we depend on. MemDep will
// skip any instructions that 'Loc' clearly doesn't interact with. If we
// end up depending on a may- or must-aliased load, then we can't optimize
// away the store and we bail out. However, if we depend on on something
// that overwrites the memory location we *can* potentially optimize it.
//
// Find out what memory location the dependent instruction stores.
Instruction *DepWrite = InstDep.getInst();
MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
// If we didn't get a useful location, or if it isn't a size, bail out.
if (!DepLoc.Ptr)
break;
// If we find a write that is a) removable (i.e., non-volatile), b) is
// completely obliterated by the store to 'Loc', and c) which we know that
// 'Inst' doesn't load from, then we can remove it.
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR =
isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);
if (OR == OverwriteComplete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
DeleteDeadInstruction(DepWrite, *MD, *TLI);
++NumFastStores;
MadeChange = true;
// DeleteDeadInstruction can delete the current instruction in loop
// cases, reset BBI.
BBI = Inst->getIterator();
if (BBI != BB.begin())
--BBI;
while (InstDep.isDef() || InstDep.isClobber()) {
// Get the memory clobbered by the instruction we depend on. MemDep
// will skip any instructions that 'Loc' clearly doesn't interact with.
// If we end up depending on a may- or must-aliased load, then we can't
// optimize away the store and we bail out. However, if we depend on on
// something that overwrites the memory location we *can* potentially
// optimize it.
//
// Find out what memory location the dependent instruction stores.
Instruction *DepWrite = InstDep.getInst();
MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
// If we didn't get a useful location, or if it isn't a size, bail out.
if (!DepLoc.Ptr)
break;
} else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
// TODO: base this on the target vector size so that if the earlier
// store was too small to get vector writes anyway then its likely
// a good idea to shorten it
// Power of 2 vector writes are probably always a bad idea to optimize
// as any store/memset/memcpy is likely using vector instructions so
// shortening it to not vector size is likely to be slower
MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
unsigned DepWriteAlign = DepIntrinsic->getAlignment();
if (llvm::isPowerOf2_64(InstWriteOffset) ||
((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: "
<< *DepWrite << "\n KILLER (offset "
<< InstWriteOffset << ", "
<< DepLoc.Size << ")"
<< *Inst << '\n');
// If we find a write that is a) removable (i.e., non-volatile), b) is
// completely obliterated by the store to 'Loc', and c) which we know
// that 'Inst' doesn't load from, then we can remove it.
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI,
DepWriteOffset, InstWriteOffset);
if (OR == OverwriteComplete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
<< "\n KILLER: " << *Inst << '\n');
Value* DepWriteLength = DepIntrinsic->getLength();
Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
InstWriteOffset -
DepWriteOffset);
DepIntrinsic->setLength(TrimmedLength);
// Delete the store and now-dead instructions that feed it.
DeleteDeadInstruction(DepWrite, *MD, *TLI);
++NumFastStores;
MadeChange = true;
// DeleteDeadInstruction can delete the current instruction in loop
// cases, reset BBI.
BBI = Inst->getIterator();
if (BBI != BB.begin())
--BBI;
break;
} else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
// TODO: base this on the target vector size so that if the earlier
// store was too small to get vector writes anyway then its likely a
// good idea to shorten it.
// Power of 2 vector writes are probably always a bad idea to
// optimize as any store/memset/memcpy is likely using vector
// instructions so shortening it to not vector size is likely to be
// slower.
MemIntrinsic *DepIntrinsic = cast<MemIntrinsic>(DepWrite);
unsigned DepWriteAlign = DepIntrinsic->getAlignment();
if (llvm::isPowerOf2_64(InstWriteOffset) ||
((DepWriteAlign != 0) &&
InstWriteOffset % DepWriteAlign == 0)) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " << *DepWrite
<< "\n KILLER (offset " << InstWriteOffset << ", "
<< DepLoc.Size << ")" << *Inst << '\n');
Value *DepWriteLength = DepIntrinsic->getLength();
Value *TrimmedLength = ConstantInt::get(
DepWriteLength->getType(), InstWriteOffset - DepWriteOffset);
DepIntrinsic->setLength(TrimmedLength);
MadeChange = true;
}
}
}
// If this is a may-aliased store that is clobbering the store value, we
// can keep searching past it for another must-aliased pointer that
// stores to the same location. For example, in
// store -> P
// store -> Q
// store -> P
// we can remove the first store to P even though we don't know if P and
// Q alias.
if (DepWrite == &BB.front())
break;
// Can't look past this instruction if it might read 'Loc'.
if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
break;
InstDep = MD->getPointerDependencyFrom(Loc, false,
DepWrite->getIterator(), &BB);
}
// If this is a may-aliased store that is clobbering the store value, we
// can keep searching past it for another must-aliased pointer that stores
// to the same location. For example, in:
// store -> P
// store -> Q
// store -> P
// we can remove the first store to P even though we don't know if P and Q
// alias.
if (DepWrite == &BB.front()) break;
// Can't look past this instruction if it might read 'Loc'.
if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
break;
InstDep = MD->getPointerDependencyFrom(Loc, false,
DepWrite->getIterator(), &BB);
} else if (EnableNonLocalDSE && InstDep.isNonLocal()) { // DSE across BB
if (++NumNonLocalAttempts < MaxNonLocalAttempts)
MadeChange |= handleNonLocalDependency(Inst);
}
}
@ -658,6 +666,150 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
return MadeChange;
}
/// A helper for handleNonLocalDependency() function to find all blocks
/// that lead to the input block BB and append them to the output PredBlocks.
/// PredBlocks will include not only predecessors of BB that unconditionally
/// lead to BB but also:
/// - single-block loops that lead to BB, and
/// - if-blocks for which one edge goes to BB and the other edge goes to
/// a block in the input SafeBlocks.
/// PredBlocks will not include blocks unreachable from the entry block, nor
/// blocks that form cycles with BB.
static void findSafePreds(SmallVectorImpl<BasicBlock *> &PredBlocks,
SmallSetVector<BasicBlock *, 8> &SafeBlocks,
BasicBlock *BB, DominatorTree *DT) {
for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
BasicBlock *Pred = *I;
if (Pred == BB)
continue;
// The second check below prevents adding blocks that form a cycle with BB
// in order to avoid potential problems due to MemoryDependenceAnalysis,
// isOverwrite, etc. being not loop-aware.
if (!DT->isReachableFromEntry(Pred) || DT->dominates(BB, Pred))
continue;
bool PredIsSafe = true;
for (auto II = succ_begin(Pred), EE = succ_end(Pred); II != EE; ++II) {
BasicBlock *Succ = *II;
if (Succ == BB || Succ == Pred) // shortcut, BB should be in SafeBlocks
continue;
if (!SafeBlocks.count(Succ)) {
PredIsSafe = false;
break;
}
}
if (PredIsSafe)
PredBlocks.push_back(Pred);
}
}
static bool underlyingObjectsDoNotAlias(StoreInst *SI, LoadInst *LI,
const DataLayout &DL,
AliasAnalysis &AA) {
Value *AObj = GetUnderlyingObject(SI->getPointerOperand(), DL);
SmallVector<Value *, 4> Pointers;
GetUnderlyingObjects(LI->getPointerOperand(), Pointers, DL);
for (auto I = Pointers.begin(), E = Pointers.end(); I != E; ++I) {
Value *BObj = *I;
if (!AA.isNoAlias(AObj, DL.getTypeStoreSize(AObj->getType()), BObj,
DL.getTypeStoreSize(BObj->getType())))
return false;
}
return true;
}
/// handleNonLocalDependency - Handle a non-local dependency on
/// the input instruction Inst located in BB in attempt to remove
/// redundant stores outside BB.
bool DSE::handleNonLocalDependency(Instruction *Inst) {
auto *SI = dyn_cast<StoreInst>(Inst);
if (!SI)
return false;
// Get the location being stored to.
// If we don't get a useful location, bail out.
MemoryLocation Loc = getLocForWrite(SI, *AA);
if (!Loc.Ptr)
return false;
bool MadeChange = false;
BasicBlock *BB = Inst->getParent();
const DataLayout &DL = BB->getModule()->getDataLayout();
// Worklist of predecessor blocks of BB
SmallVector<BasicBlock *, 8> Blocks;
// Keep track of all predecessor blocks that are safe to search through
SmallSetVector<BasicBlock *, 8> SafeBlocks;
SafeBlocks.insert(BB);
findSafePreds(Blocks, SafeBlocks, BB, DT);
while (!Blocks.empty()) {
BasicBlock *PB = Blocks.pop_back_val();
MemDepResult Dep =
MD->getPointerDependencyFrom(Loc, false, PB->end(), PB, SI);
while (Dep.isDef() || Dep.isClobber()) {
Instruction *Dependency = Dep.getInst();
// Filter out false dependency from a load to SI looking through phis.
if (auto *LI = dyn_cast<LoadInst>(Dependency)) {
if (underlyingObjectsDoNotAlias(SI, LI, DL, *AA)) {
Dep = MD->getPointerDependencyFrom(Loc, false,
Dependency->getIterator(), PB, SI);
continue;
}
}
// If we don't get a useful location for the dependent instruction,
// it doesn't write memory, it is not removable, or it might read Loc,
// then bail out.
MemoryLocation DepLoc = getLocForWrite(Dependency, *AA);
if (!DepLoc.Ptr || !hasMemoryWrite(Dependency, *TLI) ||
!isRemovable(Dependency) ||
(AA->getModRefInfo(Dependency, Loc) & MRI_Ref))
break;
// Don't remove a store within single-block loops;
// we need more analysis: e.g. looking for an interferring load
// above the store within the loop, etc.
bool SingleBlockLoop = false;
for (auto I = succ_begin(PB), E = succ_end(PB); I != E; ++I) {
BasicBlock *Succ = *I;
if (Succ == PB) {
SingleBlockLoop = true;
break;
}
}
if (SingleBlockLoop)
break;
int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR =
isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);
if (OR == OverwriteComplete) {
DEBUG(dbgs() << "DSE: Remove Non-Local Dead Store:\n DEAD: "
<< *Dependency << "\n KILLER: " << *SI << '\n');
// Delete redundant store and now-dead instructions that feed it.
auto Next = std::next(Dependency->getIterator());
DeleteDeadInstruction(Dependency, *MD, *TLI);
++NumNonLocalStores;
MadeChange = true;
Dep = MD->getPointerDependencyFrom(Loc, false, Next, PB, SI);
continue;
}
// TODO: attempt shortening of Dependency inst as in the local case
break;
}
if (Dep.isNonLocal()) {
SafeBlocks.insert(PB);
findSafePreds(Blocks, SafeBlocks, PB, DT);
}
}
return MadeChange;
}
/// Returns true if the memory which is accessed by the second instruction is not
/// modified between the first and the second instruction.
/// Precondition: Second instruction must be dominated by the first

View File

@ -0,0 +1,26 @@
; RUN: opt < %s -basicaa -dse -S | FileCheck %s
@Table = global [535 x i32] zeroinitializer, align 4
; The store in for.inc block should NOT be removed by non-local DSE.
; CHECK: store i32 64, i32* %arrayidx
;
define void @foo() {
entry:
br label %for.body
for.body:
%i = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
%arrayidx = getelementptr inbounds [535 x i32], [535 x i32]* @Table, i32 0, i32 %i
store i32 %i, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %i, 64
br i1 %cmp1, label %for.inc, label %for.end
for.inc:
store i32 64, i32* %arrayidx, align 4
%inc = add nsw i32 %i, 1
br label %for.body
for.end:
ret void
}

View File

@ -0,0 +1,22 @@
; RUN: opt < %s -basicaa -dse -S | FileCheck %s
; The store and add in if.then block should be removed by non-local DSE.
; CHECK-NOT: %stval = add
; CHECK-NOT: store i32 %stval
;
define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c) {
entry:
%cmp = icmp sgt i32 %c, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
%0 = load i32, i32* %b, align 4
%stval = add nsw i32 %0, 1
store i32 %stval, i32* %a, align 4
br label %if.end
if.end:
%m.0 = phi i32 [ 13, %if.then ], [ 10, %entry ]
store i32 %m.0, i32* %a, align 4
ret void
}

View File

@ -0,0 +1,27 @@
; RUN: opt < %s -basicaa -dse -S | FileCheck %s
; The add and store in entry block should be removed by non-local DSE.
; CHECK-NOT: %stval = add
; CHECK-NOT: store i32 %stval
;
define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c) {
entry:
%0 = load i32, i32* %b, align 4
%stval = add nsw i32 %0, 1
store i32 %stval, i32* %a, align 4
%cmp = icmp sgt i32 %c, 0
br i1 %cmp, label %if.then, label %if.else
if.then:
%1 = add nsw i32 %c, 10
br label %if.end
if.else:
%2 = add nsw i32 %c, 13
br label %if.end
if.end:
%3 = phi i32 [ %1, %if.then ], [ %2, %if.else ]
store i32 %3, i32* %a, align 4
ret void
}

View File

@ -0,0 +1,34 @@
; RUN: opt < %s -basicaa -dse -S | FileCheck %s
; The add and store in entry block should be removed by non-local DSE.
; CHECK-NOT: %stval = add
; CHECK-NOT: store i32 %stval
;
; The stores in if.then and if.else blocks should be removed by non-local DSE.
; CHECK-NOT: store i32 %1
; CHECK-NOT: store i32 %2
;
define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c) {
entry:
%0 = load i32, i32* %b, align 4
%stval = add nsw i32 %0, 1
store i32 %stval, i32* %a, align 4
%cmp = icmp sgt i32 %c, 0
br i1 %cmp, label %if.then, label %if.else
if.then:
%1 = add nsw i32 %c, 10
store i32 %1, i32* %a, align 4
br label %if.end
if.else:
%2 = add nsw i32 %c, 13
store i32 %2, i32* %a, align 4
br label %if.end
if.end:
%3 = phi i32 [ %1, %if.then ], [ %2, %if.else ]
%4 = sub nsw i32 %3, 6
store i32 %4, i32* %a, align 4
ret void
}

View File

@ -0,0 +1,42 @@
; RUN: opt < %s -basicaa -dse -S | FileCheck %s
; The store in for.body block should be removed by non-local DSE.
; CHECK-NOT: store i32 0, i32* %arrayidx
;
define void @sum(i32 %N, i32* noalias nocapture %C, i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B) {
entry:
%cmp24 = icmp eq i32 %N, 0
br i1 %cmp24, label %for.end11, label %for.body
for.body:
%i.025 = phi i32 [ %inc10, %for.cond1.for.inc9_crit_edge ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %C, i32 %i.025
store i32 0, i32* %arrayidx, align 4
%mul = mul i32 %i.025, %N
%arrayidx4.gep = getelementptr i32, i32* %A, i32 %mul
br label %for.body3
for.body3:
%0 = phi i32 [ 0, %for.body ], [ %add8, %for.body3 ]
%arrayidx4.phi = phi i32* [ %arrayidx4.gep, %for.body ], [ %arrayidx4.inc, %for.body3 ]
%arrayidx5.phi = phi i32* [ %B, %for.body ], [ %arrayidx5.inc, %for.body3 ]
%j.023 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
%1 = load i32, i32* %arrayidx4.phi, align 4
%2 = load i32, i32* %arrayidx5.phi, align 4
%add6 = add nsw i32 %2, %1
%add8 = add nsw i32 %add6, %0
%inc = add i32 %j.023, 1
%exitcond = icmp ne i32 %inc, %N
%arrayidx4.inc = getelementptr i32, i32* %arrayidx4.phi, i32 1
%arrayidx5.inc = getelementptr i32, i32* %arrayidx5.phi, i32 1
br i1 %exitcond, label %for.body3, label %for.cond1.for.inc9_crit_edge
for.cond1.for.inc9_crit_edge:
store i32 %add8, i32* %arrayidx, align 4
%inc10 = add i32 %i.025, 1
%exitcond26 = icmp ne i32 %inc10, %N
br i1 %exitcond26, label %for.body, label %for.end11
for.end11:
ret void
}