forked from OSchip/llvm-project
Fix PR23384 (under "-lsr-insns-cost" option)
Summary: The patch adds instructions number generated by a solution to LSR cost under "-lsr-insns-cost" option. Reviewers: qcolombet, hfinkel Differential Revision: http://reviews.llvm.org/D28307 From: Evgeny Stupachenko <evstupac@gmail.com> llvm-svn: 294821
This commit is contained in:
parent
a05bdf75c0
commit
fe6f548d2d
|
@ -129,6 +129,11 @@ static cl::opt<bool> EnablePhiElim(
|
|||
"enable-lsr-phielim", cl::Hidden, cl::init(true),
|
||||
cl::desc("Enable LSR phi elimination"));
|
||||
|
||||
// The flag adds instruction count to solutions cost comparision.
|
||||
static cl::opt<bool> InsnsCost(
|
||||
"lsr-insns-cost", cl::Hidden, cl::init(false),
|
||||
cl::desc("Add instruction count to a LSR cost model"));
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Stress test IV chain generation.
|
||||
static cl::opt<bool> StressIVChain(
|
||||
|
@ -325,6 +330,8 @@ struct Formula {
|
|||
|
||||
bool unscale();
|
||||
|
||||
bool hasZeroEnd() const;
|
||||
|
||||
size_t getNumRegs() const;
|
||||
Type *getType() const;
|
||||
|
||||
|
@ -459,6 +466,14 @@ bool Formula::unscale() {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool Formula::hasZeroEnd() const {
|
||||
if (UnfoldedOffset || BaseOffset)
|
||||
return false;
|
||||
if (BaseRegs.size() != 1 || ScaledReg)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Return the total number of register operands used by this formula. This does
|
||||
/// not include register uses implied by non-constant addrec strides.
|
||||
size_t Formula::getNumRegs() const {
|
||||
|
@ -895,6 +910,7 @@ namespace {
|
|||
class Cost {
|
||||
/// TODO: Some of these could be merged. Also, a lexical ordering
|
||||
/// isn't always optimal.
|
||||
unsigned Insns;
|
||||
unsigned NumRegs;
|
||||
unsigned AddRecCost;
|
||||
unsigned NumIVMuls;
|
||||
|
@ -905,8 +921,8 @@ class Cost {
|
|||
|
||||
public:
|
||||
Cost()
|
||||
: NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
|
||||
SetupCost(0), ScaleCost(0) {}
|
||||
: Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
|
||||
ImmCost(0), SetupCost(0), ScaleCost(0) {}
|
||||
|
||||
bool operator<(const Cost &Other) const;
|
||||
|
||||
|
@ -915,9 +931,9 @@ public:
|
|||
#ifndef NDEBUG
|
||||
// Once any of the metrics loses, they must all remain losers.
|
||||
bool isValid() {
|
||||
return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
|
||||
return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
|
||||
| ImmCost | SetupCost | ScaleCost) != ~0u)
|
||||
|| ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
|
||||
|| ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
|
||||
& ImmCost & SetupCost & ScaleCost) == ~0u);
|
||||
}
|
||||
#endif
|
||||
|
@ -1163,6 +1179,9 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
|
|||
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
|
||||
assert(F.isCanonical() && "Cost is accurate only for canonical formula");
|
||||
// Tally up the registers.
|
||||
unsigned PrevAddRecCost = AddRecCost;
|
||||
unsigned PrevNumRegs = NumRegs;
|
||||
unsigned PrevNumBaseAdds = NumBaseAdds;
|
||||
if (const SCEV *ScaledReg = F.ScaledReg) {
|
||||
if (VisitedRegs.count(ScaledReg)) {
|
||||
Lose();
|
||||
|
@ -1182,6 +1201,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
|
|||
return;
|
||||
}
|
||||
|
||||
// Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
|
||||
// additional instruction (at least fill).
|
||||
unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
|
||||
if (NumRegs > TTIRegNum) {
|
||||
// Cost already exceeded TTIRegNum, then only newly added register can add
|
||||
// new instructions.
|
||||
if (PrevNumRegs > TTIRegNum)
|
||||
Insns += (NumRegs - PrevNumRegs);
|
||||
else
|
||||
Insns += (NumRegs - TTIRegNum);
|
||||
}
|
||||
|
||||
// Determine how many (unfolded) adds we'll need inside the loop.
|
||||
size_t NumBaseParts = F.getNumRegs();
|
||||
if (NumBaseParts > 1)
|
||||
|
@ -1210,11 +1241,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
|
|||
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
|
||||
NumBaseAdds++;
|
||||
}
|
||||
|
||||
// If ICmpZero formula ends with not 0, it could not be replaced by
|
||||
// just add or sub. We'll need to compare final result of AddRec.
|
||||
// That means we'll need an additional instruction.
|
||||
// For -10 + {0, +, 1}:
|
||||
// i = i + 1;
|
||||
// cmp i, 10
|
||||
//
|
||||
// For {-10, +, 1}:
|
||||
// i = i + 1;
|
||||
if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
|
||||
Insns++;
|
||||
// Each new AddRec adds 1 instruction to calculation.
|
||||
Insns += (AddRecCost - PrevAddRecCost);
|
||||
|
||||
// BaseAdds adds instructions for unfolded registers.
|
||||
if (LU.Kind != LSRUse::ICmpZero)
|
||||
Insns += NumBaseAdds - PrevNumBaseAdds;
|
||||
assert(isValid() && "invalid cost");
|
||||
}
|
||||
|
||||
/// Set this cost to a losing value.
|
||||
void Cost::Lose() {
|
||||
Insns = ~0u;
|
||||
NumRegs = ~0u;
|
||||
AddRecCost = ~0u;
|
||||
NumIVMuls = ~0u;
|
||||
|
@ -1226,6 +1276,8 @@ void Cost::Lose() {
|
|||
|
||||
/// Choose the lower cost.
|
||||
bool Cost::operator<(const Cost &Other) const {
|
||||
if (InsnsCost && Insns != Other.Insns)
|
||||
return Insns < Other.Insns;
|
||||
return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
|
||||
ImmCost, SetupCost) <
|
||||
std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
|
||||
|
@ -1234,6 +1286,7 @@ bool Cost::operator<(const Cost &Other) const {
|
|||
}
|
||||
|
||||
void Cost::print(raw_ostream &OS) const {
|
||||
OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
|
||||
OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
|
||||
if (AddRecCost != 0)
|
||||
OS << ", with addrec cost " << AddRecCost;
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
|
||||
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
|
||||
; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
|
||||
|
||||
; OPT test checks that LSR optimize compare for static counter to compare with 0.
|
||||
|
||||
; BOTH: for.body:
|
||||
; INSN: icmp eq i64 %lsr.iv.next, 0
|
||||
; REGS: icmp eq i64 %indvars.iv.next, 1024
|
||||
|
||||
; LLC test checks that LSR optimize compare for static counter.
|
||||
; That means that instead of creating the following:
|
||||
; movl %ecx, (%rdx,%rax,4)
|
||||
; incq %rax
|
||||
; cmpq $1024, %rax
|
||||
; LSR should optimize out cmp:
|
||||
; movl %ecx, 4096(%rdx,%rax)
|
||||
; addq $4, %rax
|
||||
; or
|
||||
; movl %ecx, 4096(%rdx,%rax,4)
|
||||
; incq %rax
|
||||
|
||||
; CHECK: LBB0_1:
|
||||
; CHECK-NEXT: movl 4096(%{{...}},[[REG:%...]]
|
||||
; CHECK-NEXT: addl 4096(%{{...}},[[REG]]
|
||||
; CHECK-NEXT: movl %{{...}}, 4096(%{{...}},[[REG]]
|
||||
; CHECK-NOT: cmp
|
||||
; CHECK: jne
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; Function Attrs: norecurse nounwind uwtable
|
||||
define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body
|
||||
ret void
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
|
||||
%tmp = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
|
||||
%tmp1 = load i32, i32* %arrayidx2, align 4
|
||||
%add = add nsw i32 %tmp1, %tmp
|
||||
%arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
|
||||
store i32 %add, i32* %arrayidx4, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 1024
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
|
||||
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
|
||||
; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
|
||||
|
||||
; OPT checks that LSR prefers less instructions to less registers.
|
||||
; For x86 LSR should prefer complicated address to new lsr induction
|
||||
; variables.
|
||||
|
||||
; BOTH: for.body:
|
||||
; INSN: getelementptr i32, i32* %x, i64 %indvars.iv
|
||||
; INSN: getelementptr i32, i32* %y, i64 %indvars.iv
|
||||
; INSN: getelementptr i32, i32* %q, i64 %indvars.iv
|
||||
; REGS %lsr.iv4 = phi
|
||||
; REGS %lsr.iv2 = phi
|
||||
; REGS %lsr.iv1 = phi
|
||||
; REGS: getelementptr i32, i32* %lsr.iv1, i64 1
|
||||
; REGS: getelementptr i32, i32* %lsr.iv2, i64 1
|
||||
; REGS: getelementptr i32, i32* %lsr.iv4, i64 1
|
||||
|
||||
; LLC checks that LSR prefers less instructions to less registers.
|
||||
; LSR should prefer complicated address to additonal add instructions.
|
||||
|
||||
; CHECK: LBB0_2:
|
||||
; CHECK-NEXT: movl (%r{{[a-z][a-z]}},
|
||||
; CHECK-NEXT: addl (%r{{[a-z][a-z]}},
|
||||
; CHECK-NEXT: movl %e{{[a-z][a-z]}}, (%r{{[a-z][a-z]}},
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; Function Attrs: norecurse nounwind uwtable
|
||||
define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
|
||||
entry:
|
||||
%cmp10 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
%wide.trip.count = zext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup.loopexit: ; preds = %for.body
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
|
||||
ret void
|
||||
|
||||
for.body: ; preds = %for.body, %for.body.preheader
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
|
||||
%tmp = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
|
||||
%tmp1 = load i32, i32* %arrayidx2, align 4
|
||||
%add = add nsw i32 %tmp1, %tmp
|
||||
%arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
|
||||
store i32 %add, i32* %arrayidx4, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
|
||||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
Loading…
Reference in New Issue