forked from OSchip/llvm-project
[DSE] Remove stores in the same loop iteration
DSE will currently only remove stores in the same block unless they can be guaranteed to be loop invariant. This expands that to any stores that are in the same Loop, at the same loop level. This should still account for where AA/MSSA will not handle aliasing between loops, but allow the dead stores to be removed where they overlap in the same loop iteration. It requires adding loop info to DSE, but that looks fairly harmless. The test case this helps is from code like this, which can come up in certain matrix operations: for(i=..) dst[i] = 0; for(j=..) dst[i] += src[i*n+j]; After LICM, this becomes: for(i=..) dst[i] = 0; sum = 0; for(j=..) sum += src[i*n+j]; dst[i] = sum; The first store is dead, and with this patch is now removed. Differntial Revision: https://reviews.llvm.org/D100464
This commit is contained in:
parent
24ee6d3d3c
commit
222aeb4d51
|
@ -40,10 +40,12 @@
|
|||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/CaptureTracking.h"
|
||||
#include "llvm/Analysis/GlobalsModRef.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/MemoryBuiltins.h"
|
||||
#include "llvm/Analysis/MemoryLocation.h"
|
||||
#include "llvm/Analysis/MemorySSA.h"
|
||||
#include "llvm/Analysis/MemorySSAUpdater.h"
|
||||
#include "llvm/Analysis/MustExecute.h"
|
||||
#include "llvm/Analysis/PostDominators.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
|
@ -853,6 +855,11 @@ struct DSEState {
|
|||
PostDominatorTree &PDT;
|
||||
const TargetLibraryInfo &TLI;
|
||||
const DataLayout &DL;
|
||||
const LoopInfo &LI;
|
||||
|
||||
// Whether the function contains any irreducible control flow, useful for
|
||||
// being accurately able to detect loops.
|
||||
bool ContainsIrreducibleLoops;
|
||||
|
||||
// All MemoryDefs that potentially could kill other MemDefs.
|
||||
SmallVector<MemoryDef *, 64> MemDefs;
|
||||
|
@ -876,14 +883,15 @@ struct DSEState {
|
|||
DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
|
||||
|
||||
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
|
||||
PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
|
||||
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
|
||||
const LoopInfo &LI)
|
||||
: F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
|
||||
DL(F.getParent()->getDataLayout()) {}
|
||||
DL(F.getParent()->getDataLayout()), LI(LI) {}
|
||||
|
||||
static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
|
||||
DominatorTree &DT, PostDominatorTree &PDT,
|
||||
const TargetLibraryInfo &TLI) {
|
||||
DSEState State(F, AA, MSSA, DT, PDT, TLI);
|
||||
const TargetLibraryInfo &TLI, const LoopInfo &LI) {
|
||||
DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
|
||||
// Collect blocks with throwing instructions not modeled in MemorySSA and
|
||||
// alloc-like objects.
|
||||
unsigned PO = 0;
|
||||
|
@ -911,6 +919,9 @@ struct DSEState {
|
|||
State.InvisibleToCallerAfterRet.insert({&AI, true});
|
||||
}
|
||||
|
||||
// Collect whether there is any irreducible control flow in the function.
|
||||
State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
|
||||
|
||||
return State;
|
||||
}
|
||||
|
||||
|
@ -925,6 +936,12 @@ struct DSEState {
|
|||
isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
|
||||
const MemoryLocation &Later, const MemoryLocation &Earlier,
|
||||
int64_t &EarlierOff, int64_t &LaterOff) {
|
||||
// AliasAnalysis does not always account for loops. Limit overwrite checks
|
||||
// to dependencies for which we can guarantee they are independant of any
|
||||
// loops they are in.
|
||||
if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier))
|
||||
return OW_Unknown;
|
||||
|
||||
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
|
||||
// get imprecise values here, though (except for unknown sizes).
|
||||
if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
|
||||
|
@ -1250,11 +1267,27 @@ struct DSEState {
|
|||
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
|
||||
}
|
||||
|
||||
/// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
|
||||
/// loop. In particular, this guarantees that it only references a single
|
||||
/// MemoryLocation during execution of the containing function.
|
||||
bool IsGuaranteedLoopInvariant(Value *Ptr) {
|
||||
auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) {
|
||||
/// Returns true if a dependency between \p Current and \p KillingDef is
|
||||
/// guaranteed to be loop invariant for the loops that they are in. Either
|
||||
/// because they are known to be in the same block, in the same loop level or
|
||||
/// by guaranteeing that \p CurrentLoc only references a single MemoryLocation
|
||||
/// during execution of the containing function.
|
||||
bool isGuaranteedLoopIndependent(const Instruction *Current,
|
||||
const Instruction *KillingDef,
|
||||
const MemoryLocation &CurrentLoc) {
|
||||
// If the dependency is within the same block or loop level (being careful of
|
||||
// irreducible loops), we know that AA will return a valid result for the
|
||||
// memory dependency. (Both at the function level, outside of any loop,
|
||||
// would also be valid but we currently disable that to limit compile time).
|
||||
if (Current->getParent() == KillingDef->getParent())
|
||||
return true;
|
||||
const Loop *CurrentLI = LI.getLoopFor(Current->getParent());
|
||||
if (!ContainsIrreducibleLoops && CurrentLI &&
|
||||
CurrentLI == LI.getLoopFor(KillingDef->getParent()))
|
||||
return true;
|
||||
|
||||
// Otherwise check the memory location is invariant to any loops.
|
||||
auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) {
|
||||
Ptr = Ptr->stripPointerCasts();
|
||||
if (auto *I = dyn_cast<Instruction>(Ptr)) {
|
||||
if (isa<AllocaInst>(Ptr))
|
||||
|
@ -1268,7 +1301,7 @@ struct DSEState {
|
|||
return true;
|
||||
};
|
||||
|
||||
Ptr = Ptr->stripPointerCasts();
|
||||
const Value *Ptr = CurrentLoc.Ptr->stripPointerCasts();
|
||||
if (auto *I = dyn_cast<Instruction>(Ptr)) {
|
||||
if (I->getParent()->isEntryBlock())
|
||||
return true;
|
||||
|
@ -1398,9 +1431,9 @@ struct DSEState {
|
|||
|
||||
// AliasAnalysis does not account for loops. Limit elimination to
|
||||
// candidates for which we can guarantee they always store to the same
|
||||
// memory location and not multiple locations in a loop.
|
||||
if (Current->getBlock() != KillingDef->getBlock() &&
|
||||
!IsGuaranteedLoopInvariant(const_cast<Value *>(CurrentLoc->Ptr))) {
|
||||
// memory location and not located in different loops.
|
||||
if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
|
||||
LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n");
|
||||
StepAgain = true;
|
||||
Current = CurrentDef->getDefiningAccess();
|
||||
WalkerStepLimit -= 1;
|
||||
|
@ -1836,12 +1869,13 @@ struct DSEState {
|
|||
}
|
||||
};
|
||||
|
||||
bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
|
||||
DominatorTree &DT, PostDominatorTree &PDT,
|
||||
const TargetLibraryInfo &TLI) {
|
||||
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
|
||||
DominatorTree &DT, PostDominatorTree &PDT,
|
||||
const TargetLibraryInfo &TLI,
|
||||
const LoopInfo &LI) {
|
||||
bool MadeChange = false;
|
||||
|
||||
DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
|
||||
DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI);
|
||||
// For each store:
|
||||
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
|
||||
MemoryDef *KillingDef = State.MemDefs[I];
|
||||
|
@ -2014,8 +2048,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
|
|||
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
|
||||
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
|
||||
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
|
||||
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
|
||||
|
||||
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
|
||||
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
|
||||
|
||||
#ifdef LLVM_ENABLE_STATS
|
||||
if (AreStatisticsEnabled())
|
||||
|
@ -2029,6 +2064,7 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
|
|||
PreservedAnalyses PA;
|
||||
PA.preserveSet<CFGAnalyses>();
|
||||
PA.preserve<MemorySSAAnalysis>();
|
||||
PA.preserve<LoopAnalysis>();
|
||||
return PA;
|
||||
}
|
||||
|
||||
|
@ -2054,8 +2090,9 @@ public:
|
|||
MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
|
||||
PostDominatorTree &PDT =
|
||||
getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
|
||||
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
|
||||
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
|
||||
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
|
||||
|
||||
#ifdef LLVM_ENABLE_STATS
|
||||
if (AreStatisticsEnabled())
|
||||
|
@ -2077,6 +2114,8 @@ public:
|
|||
AU.addRequired<MemorySSAWrapperPass>();
|
||||
AU.addPreserved<PostDominatorTreeWrapperPass>();
|
||||
AU.addPreserved<MemorySSAWrapperPass>();
|
||||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addPreserved<LoopInfoWrapperPass>();
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2093,6 +2132,7 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
|
|||
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
|
||||
INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
|
||||
false)
|
||||
|
||||
|
|
|
@ -515,8 +515,8 @@
|
|||
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||
; GCN-O2-NEXT: Memory SSA
|
||||
; GCN-O2-NEXT: MemCpy Optimization
|
||||
; GCN-O2-NEXT: Dead Store Elimination
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
; GCN-O2-NEXT: Dead Store Elimination
|
||||
; GCN-O2-NEXT: Canonicalize natural loops
|
||||
; GCN-O2-NEXT: LCSSA Verifier
|
||||
; GCN-O2-NEXT: Loop-Closed SSA Form Pass
|
||||
|
@ -874,8 +874,8 @@
|
|||
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||
; GCN-O3-NEXT: Memory SSA
|
||||
; GCN-O3-NEXT: MemCpy Optimization
|
||||
; GCN-O3-NEXT: Dead Store Elimination
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
; GCN-O3-NEXT: Dead Store Elimination
|
||||
; GCN-O3-NEXT: Canonicalize natural loops
|
||||
; GCN-O3-NEXT: LCSSA Verifier
|
||||
; GCN-O3-NEXT: Loop-Closed SSA Form Pass
|
||||
|
|
|
@ -162,8 +162,8 @@
|
|||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: Memory SSA
|
||||
; CHECK-NEXT: MemCpy Optimization
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Natural Loop Information
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Canonicalize natural loops
|
||||
; CHECK-NEXT: LCSSA Verifier
|
||||
; CHECK-NEXT: Loop-Closed SSA Form Pass
|
||||
|
|
|
@ -167,8 +167,8 @@
|
|||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: Memory SSA
|
||||
; CHECK-NEXT: MemCpy Optimization
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Natural Loop Information
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Canonicalize natural loops
|
||||
; CHECK-NEXT: LCSSA Verifier
|
||||
; CHECK-NEXT: Loop-Closed SSA Form Pass
|
||||
|
|
|
@ -167,8 +167,8 @@
|
|||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: Memory SSA
|
||||
; CHECK-NEXT: MemCpy Optimization
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Natural Loop Information
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Canonicalize natural loops
|
||||
; CHECK-NEXT: LCSSA Verifier
|
||||
; CHECK-NEXT: Loop-Closed SSA Form Pass
|
||||
|
|
|
@ -148,8 +148,8 @@
|
|||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: Memory SSA
|
||||
; CHECK-NEXT: MemCpy Optimization
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Natural Loop Information
|
||||
; CHECK-NEXT: Dead Store Elimination
|
||||
; CHECK-NEXT: Canonicalize natural loops
|
||||
; CHECK-NEXT: LCSSA Verifier
|
||||
; CHECK-NEXT: Loop-Closed SSA Form Pass
|
||||
|
|
|
@ -111,7 +111,6 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias
|
|||
; CHECK: for.body4.lr.ph:
|
||||
; CHECK-NEXT: [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ]
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]]
|
||||
; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4
|
||||
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]]
|
||||
; CHECK-NEXT: br label [[FOR_BODY4:%.*]]
|
||||
; CHECK: for.body4:
|
||||
|
@ -626,10 +625,13 @@ exit:
|
|||
define i16 @partial_override_overloop(i1 %c, i32 %i) {
|
||||
; CHECK-LABEL: @partial_override_overloop(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: br label [[FIRST:%.*]]
|
||||
; CHECK: first:
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 [[I:%.*]]
|
||||
; CHECK-NEXT: store i16 1, i16* [[ARRAYIDX]], align 1
|
||||
; CHECK-NEXT: br label [[DO_BODY:%.*]]
|
||||
; CHECK: do.body:
|
||||
; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[DO_BODY]] ]
|
||||
; CHECK-NEXT: [[I_0:%.*]] = phi i16 [ 0, [[FIRST]] ], [ [[INC:%.*]], [[DO_BODY]] ]
|
||||
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[I_0]]
|
||||
; CHECK-NEXT: store i16 2, i16* [[ARRAYIDX2]], align 1
|
||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[I_0]], 4
|
||||
|
@ -643,12 +645,16 @@ define i16 @partial_override_overloop(i1 %c, i32 %i) {
|
|||
; CHECK-NEXT: ret i16 0
|
||||
;
|
||||
entry:
|
||||
; Branch to first so MemoryLoc is not in the entry block.
|
||||
br label %first
|
||||
|
||||
first:
|
||||
%arrayidx = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 %i
|
||||
store i16 1, i16* %arrayidx, align 1
|
||||
br label %do.body
|
||||
|
||||
do.body:
|
||||
%i.0 = phi i16 [ 0, %entry ], [ %inc, %do.body ]
|
||||
%i.0 = phi i16 [ 0, %first ], [ %inc, %do.body ]
|
||||
%arrayidx2 = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %i.0
|
||||
store i16 2, i16* %arrayidx2, align 1
|
||||
%exitcond = icmp eq i16 %i.0, 4
|
||||
|
|
Loading…
Reference in New Issue