forked from OSchip/llvm-project
Modify how the formulae are rated in Loop Strength Reduce.
Namely, check if the target allows to fold more that one register in the addressing mode and if yes, adjust the cost accordingly. Prior to this commit, reg1 + scale * reg2 accesses were artificially preferred to reg1 + reg2 accesses. Indeed, the cost model wrongly assumed that reg1 + reg2 needs a temporary register for the computation, whereas it was correctly estimated for reg1 + scale * reg2. <rdar://problem/13973908> llvm-svn: 183021
This commit is contained in:
parent
f1ed334d55
commit
8aa7abe2ae
|
@ -773,6 +773,13 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
|
||||||
return Changed;
|
return Changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
class LSRUse;
|
||||||
|
}
|
||||||
|
// Check if it is legal to fold 2 base registers.
|
||||||
|
static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
|
||||||
|
const Formula &F);
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
/// Cost - This class is used to measure and compare candidate formulae.
|
/// Cost - This class is used to measure and compare candidate formulae.
|
||||||
|
@ -810,12 +817,14 @@ public:
|
||||||
return NumRegs == ~0u;
|
return NumRegs == ~0u;
|
||||||
}
|
}
|
||||||
|
|
||||||
void RateFormula(const Formula &F,
|
void RateFormula(const TargetTransformInfo &TTI,
|
||||||
|
const Formula &F,
|
||||||
SmallPtrSet<const SCEV *, 16> &Regs,
|
SmallPtrSet<const SCEV *, 16> &Regs,
|
||||||
const DenseSet<const SCEV *> &VisitedRegs,
|
const DenseSet<const SCEV *> &VisitedRegs,
|
||||||
const Loop *L,
|
const Loop *L,
|
||||||
const SmallVectorImpl<int64_t> &Offsets,
|
const SmallVectorImpl<int64_t> &Offsets,
|
||||||
ScalarEvolution &SE, DominatorTree &DT,
|
ScalarEvolution &SE, DominatorTree &DT,
|
||||||
|
const LSRUse &LU,
|
||||||
SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
|
SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
|
||||||
|
|
||||||
void print(raw_ostream &OS) const;
|
void print(raw_ostream &OS) const;
|
||||||
|
@ -900,12 +909,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Cost::RateFormula(const Formula &F,
|
void Cost::RateFormula(const TargetTransformInfo &TTI,
|
||||||
|
const Formula &F,
|
||||||
SmallPtrSet<const SCEV *, 16> &Regs,
|
SmallPtrSet<const SCEV *, 16> &Regs,
|
||||||
const DenseSet<const SCEV *> &VisitedRegs,
|
const DenseSet<const SCEV *> &VisitedRegs,
|
||||||
const Loop *L,
|
const Loop *L,
|
||||||
const SmallVectorImpl<int64_t> &Offsets,
|
const SmallVectorImpl<int64_t> &Offsets,
|
||||||
ScalarEvolution &SE, DominatorTree &DT,
|
ScalarEvolution &SE, DominatorTree &DT,
|
||||||
|
const LSRUse &LU,
|
||||||
SmallPtrSet<const SCEV *, 16> *LoserRegs) {
|
SmallPtrSet<const SCEV *, 16> *LoserRegs) {
|
||||||
// Tally up the registers.
|
// Tally up the registers.
|
||||||
if (const SCEV *ScaledReg = F.ScaledReg) {
|
if (const SCEV *ScaledReg = F.ScaledReg) {
|
||||||
|
@ -932,7 +943,9 @@ void Cost::RateFormula(const Formula &F,
|
||||||
// Determine how many (unfolded) adds we'll need inside the loop.
|
// Determine how many (unfolded) adds we'll need inside the loop.
|
||||||
size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
|
size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
|
||||||
if (NumBaseParts > 1)
|
if (NumBaseParts > 1)
|
||||||
NumBaseAdds += NumBaseParts - 1;
|
// Do not count the base and a possible second register if the target
|
||||||
|
// allows to fold 2 registers.
|
||||||
|
NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));
|
||||||
|
|
||||||
// Tally up the non-zero immediates.
|
// Tally up the non-zero immediates.
|
||||||
for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
|
for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
|
||||||
|
@ -1359,6 +1372,30 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
|
||||||
F.BaseOffset, F.HasBaseReg, F.Scale);
|
F.BaseOffset, F.HasBaseReg, F.Scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
|
||||||
|
const Formula &F) {
|
||||||
|
// If F is used as an Addressing Mode, it may fold one Base plus one
|
||||||
|
// scaled register. If the scaled register is nil, do as if another
|
||||||
|
// element of the base regs is a 1-scaled register.
|
||||||
|
// This is possible if BaseRegs has at least 2 registers.
|
||||||
|
|
||||||
|
// If this is not an address calculation, this is not an addressing mode
|
||||||
|
// use.
|
||||||
|
if (LU.Kind != LSRUse::Address)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// F is already scaled.
|
||||||
|
if (F.Scale != 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// We need to keep one register for the base and one to scale.
|
||||||
|
if (F.BaseRegs.size() < 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
|
||||||
|
F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
|
||||||
|
}
|
||||||
|
|
||||||
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
|
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
|
||||||
LSRUse::KindType Kind, Type *AccessTy,
|
LSRUse::KindType Kind, Type *AccessTy,
|
||||||
GlobalValue *BaseGV, int64_t BaseOffset,
|
GlobalValue *BaseGV, int64_t BaseOffset,
|
||||||
|
@ -3690,7 +3727,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
|
||||||
// the corresponding bad register from the Regs set.
|
// the corresponding bad register from the Regs set.
|
||||||
Cost CostF;
|
Cost CostF;
|
||||||
Regs.clear();
|
Regs.clear();
|
||||||
CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
|
CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
|
||||||
&LoserRegs);
|
&LoserRegs);
|
||||||
if (CostF.isLoser()) {
|
if (CostF.isLoser()) {
|
||||||
// During initial formula generation, undesirable formulae are generated
|
// During initial formula generation, undesirable formulae are generated
|
||||||
|
@ -3726,7 +3763,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
|
||||||
|
|
||||||
Cost CostBest;
|
Cost CostBest;
|
||||||
Regs.clear();
|
Regs.clear();
|
||||||
CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
|
CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
|
||||||
|
DT, LU);
|
||||||
if (CostF < CostBest)
|
if (CostF < CostBest)
|
||||||
std::swap(F, Best);
|
std::swap(F, Best);
|
||||||
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
|
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
|
||||||
|
@ -4079,7 +4117,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
|
||||||
// the current best, prune the search at that point.
|
// the current best, prune the search at that point.
|
||||||
NewCost = CurCost;
|
NewCost = CurCost;
|
||||||
NewRegs = CurRegs;
|
NewRegs = CurRegs;
|
||||||
NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
|
NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
|
||||||
|
LU);
|
||||||
if (NewCost < SolutionCost) {
|
if (NewCost < SolutionCost) {
|
||||||
Workspace.push_back(&F);
|
Workspace.push_back(&F);
|
||||||
if (Workspace.size() != Uses.size()) {
|
if (Workspace.size() != Uses.size()) {
|
||||||
|
|
|
@ -3,9 +3,8 @@
|
||||||
; RUN: not grep movz %t
|
; RUN: not grep movz %t
|
||||||
; RUN: not grep sar %t
|
; RUN: not grep sar %t
|
||||||
; RUN: not grep shl %t
|
; RUN: not grep shl %t
|
||||||
; RUN: grep add %t | count 1
|
; RUN: grep add %t | count 5
|
||||||
; RUN: grep inc %t | count 4
|
; RUN: grep inc %t | count 2
|
||||||
; RUN: grep dec %t | count 2
|
|
||||||
; RUN: grep lea %t | count 3
|
; RUN: grep lea %t | count 3
|
||||||
|
|
||||||
; Optimize away zext-inreg and sext-inreg on the loop induction
|
; Optimize away zext-inreg and sext-inreg on the loop induction
|
||||||
|
|
|
@ -10,12 +10,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
|
||||||
; Verify that nothing uses the "dead" ptrtoint from "undef".
|
; Verify that nothing uses the "dead" ptrtoint from "undef".
|
||||||
; CHECK: @VerifyDiagnosticConsumerTest
|
; CHECK: @VerifyDiagnosticConsumerTest
|
||||||
; CHECK: bb:
|
; CHECK: bb:
|
||||||
; CHECK: %0 = ptrtoint i8* undef to i64
|
; "dead" ptrpoint not emitted (or dead code eliminated) with
|
||||||
; CHECK-NOT: %0
|
; current LSR cost model.
|
||||||
|
; CHECK-NOT: = ptrtoint i8* undef to i64
|
||||||
; CHECK: .lr.ph
|
; CHECK: .lr.ph
|
||||||
; CHECK-NOT: %0
|
; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
|
||||||
; CHECK: sub i64 %7, %tmp6
|
; CHECK: sub i64 [[TMP]], %tmp6
|
||||||
; CHECK-NOT: %0
|
|
||||||
; CHECK: ret void
|
; CHECK: ret void
|
||||||
define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
|
define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
|
||||||
bb:
|
bb:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
; RUN: opt < %s -loop-reduce -S | not grep uglygep
|
; RUN: opt < %s -loop-reduce -S | FileCheck %s
|
||||||
|
|
||||||
; LSR shouldn't consider %t8 to be an interesting user of %t6, and it
|
; LSR shouldn't consider %t8 to be an interesting user of %t6, and it
|
||||||
; should be able to form pretty GEPs.
|
; should be able to form pretty GEPs.
|
||||||
|
@ -6,6 +6,7 @@
|
||||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
||||||
|
|
||||||
define void @Z4() nounwind {
|
define void @Z4() nounwind {
|
||||||
|
; CHECK: define void @Z4
|
||||||
bb:
|
bb:
|
||||||
br label %bb3
|
br label %bb3
|
||||||
|
|
||||||
|
@ -20,11 +21,26 @@ bb3: ; preds = %bb2, %bb
|
||||||
%t4 = phi i64 [ %t, %bb2 ], [ 0, %bb ] ; <i64> [#uses=3]
|
%t4 = phi i64 [ %t, %bb2 ], [ 0, %bb ] ; <i64> [#uses=3]
|
||||||
br label %bb1
|
br label %bb1
|
||||||
|
|
||||||
|
; CHECK: bb10:
|
||||||
|
; CHECK-NEXT: %t7 = icmp eq i64 %t4, 0
|
||||||
|
; Host %t2 computation outside the loop.
|
||||||
|
; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8* undef, i64 %t4
|
||||||
|
; CHECK-NEXT: br label %bb14
|
||||||
bb10: ; preds = %bb9
|
bb10: ; preds = %bb9
|
||||||
%t7 = icmp eq i64 %t4, 0 ; <i1> [#uses=1]
|
%t7 = icmp eq i64 %t4, 0 ; <i1> [#uses=1]
|
||||||
%t3 = add i64 %t4, 16 ; <i64> [#uses=1]
|
%t3 = add i64 %t4, 16 ; <i64> [#uses=1]
|
||||||
br label %bb14
|
br label %bb14
|
||||||
|
|
||||||
|
; CHECK: bb14:
|
||||||
|
; CHECK-NEXT: store i8 undef, i8* [[SCEVGEP]]
|
||||||
|
; CHECK-NEXT: %t6 = load float** undef
|
||||||
|
; Fold %t3's add within the address.
|
||||||
|
; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float* %t6, i64 4
|
||||||
|
; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float* [[SCEVGEP1]] to i8*
|
||||||
|
; Use the induction variable (%t4) to access the right element
|
||||||
|
; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8* [[SCEVGEP2]], i64 %t4
|
||||||
|
; CHECK-NEXT: store i8 undef, i8* [[ADDRESS]]
|
||||||
|
; CHECK-NEXT: br label %bb14
|
||||||
bb14: ; preds = %bb14, %bb10
|
bb14: ; preds = %bb14, %bb10
|
||||||
%t2 = getelementptr inbounds i8* undef, i64 %t4 ; <i8*> [#uses=1]
|
%t2 = getelementptr inbounds i8* undef, i64 %t4 ; <i8*> [#uses=1]
|
||||||
store i8 undef, i8* %t2
|
store i8 undef, i8* %t2
|
||||||
|
@ -36,9 +52,15 @@ bb14: ; preds = %bb14, %bb10
|
||||||
}
|
}
|
||||||
|
|
||||||
define fastcc void @TransformLine() nounwind {
|
define fastcc void @TransformLine() nounwind {
|
||||||
|
; CHECK: @TransformLine
|
||||||
bb:
|
bb:
|
||||||
br label %loop0
|
br label %loop0
|
||||||
|
|
||||||
|
; CHECK: loop0:
|
||||||
|
; Induction variable is initialized to -2.
|
||||||
|
; CHECK-NEXT: [[PHIIV:%[^ ]+]] = phi i32 [ [[IVNEXT:%[^ ]+]], %loop0 ], [ -2, %bb ]
|
||||||
|
; CHECK-NEXT: [[IVNEXT]] = add i32 [[PHIIV]], 1
|
||||||
|
; CHECK-NEXT: br i1 false, label %loop0, label %bb0
|
||||||
loop0: ; preds = %loop0, %bb
|
loop0: ; preds = %loop0, %bb
|
||||||
%i0 = phi i32 [ %i0.next, %loop0 ], [ 0, %bb ] ; <i32> [#uses=2]
|
%i0 = phi i32 [ %i0.next, %loop0 ], [ 0, %bb ] ; <i32> [#uses=2]
|
||||||
%i0.next = add i32 %i0, 1 ; <i32> [#uses=1]
|
%i0.next = add i32 %i0, 1 ; <i32> [#uses=1]
|
||||||
|
@ -47,18 +69,52 @@ loop0: ; preds = %loop0, %bb
|
||||||
bb0: ; preds = %loop0
|
bb0: ; preds = %loop0
|
||||||
br label %loop1
|
br label %loop1
|
||||||
|
|
||||||
|
; CHECK: loop1:
|
||||||
|
; CHECK-NEXT: %i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ]
|
||||||
|
; IVNEXT covers the uses of %i0 and %t0.
|
||||||
|
; Therefore, %t0 has been removed.
|
||||||
|
; The critical edge has been split.
|
||||||
|
; CHECK-NEXT: br i1 false, label %bb2, label %[[LOOP1BB6:.+]]
|
||||||
loop1: ; preds = %bb5, %bb0
|
loop1: ; preds = %bb5, %bb0
|
||||||
%i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ] ; <i32> [#uses=4]
|
%i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ] ; <i32> [#uses=4]
|
||||||
%t0 = add i32 %i0, %i1 ; <i32> [#uses=1]
|
%t0 = add i32 %i0, %i1 ; <i32> [#uses=1]
|
||||||
br i1 false, label %bb2, label %bb6
|
br i1 false, label %bb2, label %bb6
|
||||||
|
|
||||||
|
; CHECK: bb2:
|
||||||
|
; Critical edge split.
|
||||||
|
; CHECK-NEXT: br i1 true, label %[[BB2BB6:[^,]+]], label %bb5
|
||||||
bb2: ; preds = %loop1
|
bb2: ; preds = %loop1
|
||||||
br i1 true, label %bb6, label %bb5
|
br i1 true, label %bb6, label %bb5
|
||||||
|
|
||||||
|
; CHECK: bb5:
|
||||||
|
; CHECK-NEXT: %i1.next = add i32 %i1, 1
|
||||||
|
; CHECK-NEXT: br i1 true, label %[[BB5BB6:[^,]+]], label %loop1
|
||||||
bb5: ; preds = %bb2
|
bb5: ; preds = %bb2
|
||||||
%i1.next = add i32 %i1, 1 ; <i32> [#uses=1]
|
%i1.next = add i32 %i1, 1 ; <i32> [#uses=1]
|
||||||
br i1 true, label %bb6, label %loop1
|
br i1 true, label %bb6, label %loop1
|
||||||
|
|
||||||
|
; bb5 to bb6 split basic block.
|
||||||
|
; CHECK: [[BB5BB6]]:
|
||||||
|
; CHECK-NEXT: [[INITIALVAL:%[^ ]+]] = add i32 [[IVNEXT]], %i1.next
|
||||||
|
; CHECK-NEXT: br label %[[SPLITTOBB6:.+]]
|
||||||
|
|
||||||
|
; bb2 to bb6 split basic block.
|
||||||
|
; CHECK: [[BB2BB6]]:
|
||||||
|
; CHECK-NEXT: br label %[[SPLITTOBB6]]
|
||||||
|
|
||||||
|
; Split basic blocks to bb6.
|
||||||
|
; CHECK: [[SPLITTOBB6]]:
|
||||||
|
; CHECK-NEXT: [[INITP8:%[^ ]+]] = phi i32 [ [[INITIALVAL]], %[[BB5BB6]] ], [ undef, %[[BB2BB6]] ]
|
||||||
|
; CHECK-NEXT: [[INITP9:%[^ ]+]] = phi i32 [ undef, %[[BB5BB6]] ], [ %i1, %[[BB2BB6]] ]
|
||||||
|
; CHECK-NEXT: br label %bb6
|
||||||
|
|
||||||
|
; CHECK: [[LOOP1BB6]]:
|
||||||
|
; CHECK-NEXT: br label %bb6
|
||||||
|
|
||||||
|
; CHECK: bb6:
|
||||||
|
; CHECK-NEXT: %p8 = phi i32 [ undef, %[[LOOP1BB6]] ], [ [[INITP8]], %[[SPLITTOBB6]] ]
|
||||||
|
; CHECK-NEXT: %p9 = phi i32 [ %i1, %[[LOOP1BB6]] ], [ [[INITP9]], %[[SPLITTOBB6]] ]
|
||||||
|
; CHECK-NEXT: unreachable
|
||||||
bb6: ; preds = %bb5, %bb2, %loop1
|
bb6: ; preds = %bb5, %bb2, %loop1
|
||||||
%p8 = phi i32 [ %t0, %bb5 ], [ undef, %loop1 ], [ undef, %bb2 ] ; <i32> [#uses=0]
|
%p8 = phi i32 [ %t0, %bb5 ], [ undef, %loop1 ], [ undef, %bb2 ] ; <i32> [#uses=0]
|
||||||
%p9 = phi i32 [ undef, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; <i32> [#uses=0]
|
%p9 = phi i32 [ undef, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; <i32> [#uses=0]
|
||||||
|
|
Loading…
Reference in New Issue