Fix PR23384 (under "-lsr-insns-cost" option)

Summary: The patch adds instructions number generated by a solution to LSR cost under "-lsr-insns-cost" option. Reviewers: qcolombet, hfinkel Differential Revision: http://reviews.llvm.org/D28307 From: Evgeny Stupachenko <evstupac@gmail.com> llvm-svn: 294821
2017-02-11 02:57:43 +00:00 · 2017-02-11 02:57:43 +00:00 · fe6f548d2d
parent a05bdf75c0
commit fe6f548d2d
3 changed files with 167 additions and 4 deletions
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@ -129,6 +129,11 @@ static cl::opt<bool> EnablePhiElim(
  "enable-lsr-phielim", cl::Hidden, cl::init(true),
  cl::desc("Enable LSR phi elimination"));

+// The flag adds instruction count to solutions cost comparision.
+static cl::opt<bool> InsnsCost(
+  "lsr-insns-cost", cl::Hidden, cl::init(false),
+  cl::desc("Add instruction count to a LSR cost model"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@ -325,6 +330,8 @@ struct Formula {

  bool unscale();

+  bool hasZeroEnd() const;
+
  size_t getNumRegs() const;
  Type *getType() const;

@ -459,6 +466,14 @@ bool Formula::unscale() {
  return true;
 }

+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@ -895,6 +910,7 @@ namespace {
 class Cost {
  /// TODO: Some of these could be merged. Also, a lexical ordering
  /// isn't always optimal.
+  unsigned Insns;
  unsigned NumRegs;
  unsigned AddRecCost;
  unsigned NumIVMuls;
@ -905,8 +921,8 @@ class Cost {

 public:
  Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+    : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+      ImmCost(0), SetupCost(0), ScaleCost(0) {}

  bool operator<(const Cost &Other) const;

@ -915,9 +931,9 @@ public:
 #ifndef NDEBUG
  // Once any of the metrics loses, they must all remain losers.
  bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+    return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
             | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+      || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
           & ImmCost & SetupCost & ScaleCost) == ~0u);
  }
 #endif
@ -1163,6 +1179,9 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
  // Tally up the registers.
+  unsigned PrevAddRecCost = AddRecCost;
+  unsigned PrevNumRegs = NumRegs;
+  unsigned PrevNumBaseAdds = NumBaseAdds;
  if (const SCEV *ScaledReg = F.ScaledReg) {
    if (VisitedRegs.count(ScaledReg)) {
      Lose();
@ -1182,6 +1201,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
      return;
  }

+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // additional instruction (at least fill).
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (NumRegs > TTIRegNum) {
+    // Cost already exceeded TTIRegNum, then only newly added register can add
+    // new instructions.
+    if (PrevNumRegs > TTIRegNum)
+      Insns += (NumRegs - PrevNumRegs);
+    else
+      Insns += (NumRegs - TTIRegNum);
+  }
+
  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.getNumRegs();
  if (NumBaseParts > 1)
@ -1210,11 +1241,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
        !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
      NumBaseAdds++;
  }
+
+  // If ICmpZero formula ends with not 0, it could not be replaced by
+  // just add or sub. We'll need to compare final result of AddRec.
+  // That means we'll need an additional instruction.
+  // For -10 + {0, +, 1}:
+  // i = i + 1;
+  // cmp i, 10
+  //
+  // For {-10, +, 1}:
+  // i = i + 1;
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    Insns++;
+  // Each new AddRec adds 1 instruction to calculation.
+  Insns += (AddRecCost - PrevAddRecCost);
+
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    Insns += NumBaseAdds - PrevNumBaseAdds;
  assert(isValid() && "invalid cost");
 }

 /// Set this cost to a losing value.
 void Cost::Lose() {
+  Insns = ~0u;
  NumRegs = ~0u;
  AddRecCost = ~0u;
  NumIVMuls = ~0u;
@ -1226,6 +1276,8 @@ void Cost::Lose() {

 /// Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
+  if (InsnsCost && Insns != Other.Insns)
+    return Insns < Other.Insns;
  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                  ImmCost, SetupCost) <
         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@ -1234,6 +1286,7 @@ bool Cost::operator<(const Cost &Other) const {
 }

 void Cost::print(raw_ostream &OS) const {
+  OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
  if (AddRecCost != 0)
    OS << ", with addrec cost " << AddRecCost;
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
+
+; OPT test checks that LSR optimize compare for static counter to compare with 0.
+
+; BOTH: for.body:
+; INSN: icmp eq i64 %lsr.iv.next, 0
+; REGS: icmp eq i64 %indvars.iv.next, 1024
+
+; LLC test checks that LSR optimize compare for static counter.
+; That means that instead of creating the following:
+;   movl %ecx, (%rdx,%rax,4)
+;   incq %rax
+;   cmpq $1024, %rax
+; LSR should optimize out cmp:
+;   movl %ecx, 4096(%rdx,%rax)
+;   addq $4, %rax
+; or
+;   movl %ecx, 4096(%rdx,%rax,4)
+;   incq %rax
+
+; CHECK:      LBB0_1:
+; CHECK-NEXT:   movl 4096(%{{...}},[[REG:%...]]
+; CHECK-NEXT:   addl 4096(%{{...}},[[REG]]
+; CHECK-NEXT:   movl %{{...}}, 4096(%{{...}},[[REG]]
+; CHECK-NOT:    cmp
+; CHECK:        jne
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
+
+; OPT checks that LSR prefers less instructions to less registers.
+; For x86 LSR should prefer complicated address to new lsr induction
+; variables.
+
+; BOTH: for.body:
+; INSN:   getelementptr i32, i32* %x, i64 %indvars.iv
+; INSN:   getelementptr i32, i32* %y, i64 %indvars.iv
+; INSN:   getelementptr i32, i32* %q, i64 %indvars.iv
+; REGS    %lsr.iv4 = phi
+; REGS    %lsr.iv2 = phi
+; REGS    %lsr.iv1 = phi
+; REGS:   getelementptr i32, i32* %lsr.iv1, i64 1
+; REGS:   getelementptr i32, i32* %lsr.iv2, i64 1
+; REGS:   getelementptr i32, i32* %lsr.iv4, i64 1
+
+; LLC checks that LSR prefers less instructions to less registers.
+; LSR should prefer complicated address to additonal add instructions.
+
+; CHECK:      LBB0_2:
+; CHECK-NEXT:   movl (%r{{[a-z][a-z]}},
+; CHECK-NEXT:   addl (%r{{[a-z][a-z]}},
+; CHECK-NEXT:   movl %e{{[a-z][a-z]}}, (%r{{[a-z][a-z]}},
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}