[LSR] Canonicalize the formula and put the recurrent Reg related to the current loop in ScaledReg.

After rL294814, an LSR formula can have multiple SCEVAddRecExprs inside its BaseRegs.
The previous canonicalization swapped the first SCEVAddRecExpr in BaseRegs with ScaledReg;
now we want to swap the SCEVAddRecExpr Reg related to the current loop with ScaledReg instead.
Otherwise we may generate code like RegA + lsr.iv + RegB, where the loop-invariant
parts RegA and RegB are not grouped together and cannot be promoted outside of the loop.
With this patch, canonicalization ensures lsr.iv is generated last in the expression,
RegA + RegB + lsr.iv, so that RegA + RegB can be promoted outside of the loop.
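
To make the effect concrete, here is a minimal C++ sketch (not taken from the patch;
RegA, RegB, iv and the function names are made up for illustration) of the addressing
pattern before and after the new canonicalization:

  // Hypothetical illustration only: RegA and RegB stand for the loop-invariant
  // registers, iv for the induction variable of the current loop.
  void before(char *Base, long RegA, long RegB, unsigned long N) {
    for (unsigned long iv = 0; iv < N; ++iv)
      Base[RegA + iv + RegB] = 0;          // RegA + iv + RegB: invariant adds stay in the loop
  }

  void after(char *Base, long RegA, long RegB, unsigned long N) {
    char *Hoisted = Base + (RegA + RegB);  // RegA + RegB grouped, hoistable to the preheader
    for (unsigned long iv = 0; iv < N; ++iv)
      Hoisted[iv] = 0;                     // only the iv addition remains inside the loop
  }

Keeping the recurrent register in ScaledReg is what lets LSR expand the invariant
sum once outside the inner loop.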

Differential Revision: https://reviews.llvm.org/D26781

llvm-svn: 295884
Wei Mi 2017-02-22 21:47:08 +00:00
parent 3596a81c69
commit 74d5a90fa6
2 changed files with 148 additions and 39 deletions


@@ -307,9 +307,13 @@ struct Formula {
/// canonical representation of a formula is
/// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
/// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
/// 3. The reg containing the recurrent expr related to the current loop in
/// the formula should be put in ScaledReg.
/// #1 enforces that the scaled register is always used when at least two
/// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
/// #2 enforces that 1 * reg is reg.
/// #3 ensures invariant regs with respect to the current loop can be combined
/// together in LSR codegen.
/// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
@@ -330,9 +334,9 @@ struct Formula {
void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
bool isCanonical() const;
bool isCanonical(const Loop &L) const;
void canonicalize();
void canonicalize(const Loop &L);
bool unscale();
@@ -424,16 +428,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
BaseRegs.push_back(Sum);
HasBaseReg = true;
}
canonicalize();
canonicalize(*L);
}
/// \brief Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical() const {
if (ScaledReg)
return Scale != 1 || !BaseRegs.empty();
bool Formula::isCanonical(const Loop &L) const {
if (!ScaledReg)
return BaseRegs.size() <= 1;
if (Scale != 1)
return true;
if (Scale == 1 && BaseRegs.empty())
return false;
const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
if (SAR && SAR->getLoop() == &L)
return true;
// If ScaledReg is not a recurrent expr, or it is one but its loop is not the
// current loop, while BaseRegs contains a recurrent expr reg related to the
// current loop, we want to swap that reg in BaseRegs with ScaledReg.
auto I =
find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
return isa<const SCEVAddRecExpr>(S) &&
(cast<SCEVAddRecExpr>(S)->getLoop() == &L);
});
return I == BaseRegs.end();
}
/// \brief Helper method to morph a formula into its canonical representation.
@@ -442,21 +465,33 @@ bool Formula::isCanonical() const {
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
void Formula::canonicalize() {
if (isCanonical())
void Formula::canonicalize(const Loop &L) {
if (isCanonical(L))
return;
// So far we did not need this case. This is easy to implement but it is
// useless to maintain dead code. Besides, it could hurt compile time.
assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
// Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
if (!ScaledReg) {
ScaledReg = BaseRegs.back();
BaseRegs.pop_back();
Scale = 1;
size_t BaseRegsSize = BaseRegs.size();
size_t Try = 0;
// If ScaledReg is an invariant, try to find a variant expression.
while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
std::swap(ScaledReg, BaseRegs[Try++]);
}
// If ScaledReg is invariant with respect to L, find the reg from
// BaseRegs containing the recurrent expr related to Loop L and swap
// that reg with ScaledReg.
const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
if (!SAR || SAR->getLoop() != &L) {
auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
[&](const SCEV *S) {
return isa<const SCEVAddRecExpr>(S) &&
(cast<SCEVAddRecExpr>(S)->getLoop() == &L);
});
if (I != BaseRegs.end())
std::swap(ScaledReg, *I);
}
}
/// \brief Get rid of the scale in the formula.
@@ -908,7 +943,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
const LSRUse &LU, const Formula &F,
const Loop &L);
namespace {
@@ -1102,7 +1138,7 @@ public:
bool HasFormulaWithSameRegs(const Formula &F) const;
float getNotSelectedProbability(const SCEV *Reg) const;
bool InsertFormula(const Formula &F);
bool InsertFormula(const Formula &F, const Loop &L);
void DeleteFormula(Formula &F);
void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@@ -1191,7 +1227,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
assert(F.isCanonical() && "Cost is accurate only for canonical formula");
assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
// Tally up the registers.
unsigned PrevAddRecCost = AddRecCost;
unsigned PrevNumRegs = NumRegs;
@@ -1237,7 +1273,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
NumBaseAdds += (F.UnfoldedOffset != 0);
// Accumulate non-free scaling amounts.
ScaleCost += getScalingFactorCost(TTI, LU, F);
ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1391,8 +1427,8 @@ float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
bool LSRUse::InsertFormula(const Formula &F) {
assert(F.isCanonical() && "Invalid canonical representation");
bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
assert(F.isCanonical(L) && "Invalid canonical representation");
if (!Formulae.empty() && RigidFormula)
return false;
@@ -1562,7 +1598,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
const Formula &F) {
const Formula &F, const Loop &L) {
// For the purpose of isAMCompletelyFolded either having a canonical formula
// or a scale not equal to zero is correct.
// Problems may arise from non canonical formulae having a scale == 0.
@@ -1570,7 +1606,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// However, when we generate the scaled formulae, we first check that the
// scaling factor is profitable before computing the actual ScaledReg for
// compile time sake.
assert((F.isCanonical() || F.Scale != 0));
assert((F.isCanonical(L) || F.Scale != 0));
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
@@ -1605,14 +1641,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F) {
const LSRUse &LU, const Formula &F,
const Loop &L) {
if (!F.Scale)
return 0;
// If the use is not completely folded in that instruction, we will have to
// pay an extra cost only for scale != 1.
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
LU.AccessTy, F))
LU.AccessTy, F, L))
return F.Scale != 1;
switch (LU.Kind) {
@@ -3206,7 +3243,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// Do not insert formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");
if (!LU.InsertFormula(F))
if (!LU.InsertFormula(F, *L))
return false;
CountRegisters(F, LUIdx);
@@ -3445,7 +3483,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
F.BaseRegs.push_back(*J);
// We may have changed the number of registers in base regs; adjust the
// formula accordingly.
F.canonicalize();
F.canonicalize(*L);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
@@ -3457,7 +3495,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
/// Split out subexpressions from adds and the bases of addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base, unsigned Depth) {
assert(Base.isCanonical() && "Input must be in the canonical form");
assert(Base.isCanonical(*L) && "Input must be in the canonical form");
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3)
return;
@@ -3498,7 +3536,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// rather than proceed with zero in a register.
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
F.canonicalize();
F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3555,7 +3593,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
F.ScaledReg = nullptr;
} else
F.deleteBaseReg(F.BaseRegs[Idx]);
F.canonicalize();
F.canonicalize(*L);
} else if (IsScaledReg)
F.ScaledReg = NewG;
else
@@ -3718,10 +3756,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (LU.Kind == LSRUse::ICmpZero &&
!Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
continue;
// For each addrec base reg, apply the scale, if possible.
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
if (const SCEVAddRecExpr *AR =
dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
// For each addrec base reg, if its loop is the current loop, apply the scale.
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
if (FactorS->isZero())
continue;
@@ -3735,12 +3773,18 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// The canonical representation of 1*reg is reg, which is already in
// Base. In that case, do not try to insert the formula, it will be
// rejected anyway.
if (F.Scale == 1 && F.BaseRegs.empty())
if (F.Scale == 1 && (F.BaseRegs.empty() ||
(AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
continue;
// If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
// a non-canonical Formula whose ScaledReg's loop is not L.
if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
}
}
}
/// Generate reuse formulae from different IV types.
@@ -3920,7 +3964,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
continue;
// OK, looks good.
NewF.canonicalize();
NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
@@ -3952,7 +3996,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
goto skip_formula;
// Ok, looks good.
NewF.canonicalize();
NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;

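Before the new regression test below, a toy, self-contained C++ sketch of the
selection logic the new Formula::canonicalize(const Loop &L) implements. This is
not LLVM code: Reg, ToyFormula, and the loop handle are made up, and Scale
bookkeeping is omitted; it only mirrors the rule of keeping the add-rec of the
current loop in ScaledReg so the registers left in BaseRegs (the loop-invariant
part) can later be expanded and hoisted together.

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Toy stand-in: a "register" just records which loop (if any) it is an
  // add-rec of; the real code uses SCEV/SCEVAddRecExpr instead.
  struct Reg {
    const void *LoopOfAddRec = nullptr;
  };

  struct ToyFormula {
    Reg *ScaledReg = nullptr;
    std::vector<Reg *> BaseRegs;

    void canonicalize(const void *CurLoop) {
      // Promote some base reg into ScaledReg if there is none yet.
      if (!ScaledReg) {
        assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
        ScaledReg = BaseRegs.back();
        BaseRegs.pop_back();
      }
      // Already the add-rec of the current loop: nothing to do.
      if (ScaledReg->LoopOfAddRec == CurLoop)
        return;
      // Otherwise swap in the base reg that is recurrent for the current loop.
      auto I = std::find_if(BaseRegs.begin(), BaseRegs.end(),
                            [&](Reg *R) { return R->LoopOfAddRec == CurLoop; });
      if (I != BaseRegs.end())
        std::swap(ScaledReg, *I);
    }
  };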

@@ -0,0 +1,65 @@
; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -S < %s | FileCheck %s
; Check that LSR formula canonicalization puts loop-invariant regs before the
; induction variable of the current loop, so exprs involving loop-invariant
; regs can be promoted outside of the current loop.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define void @foo(i32 %size, i32 %nsteps, i8* nocapture %maxarray, i8* nocapture readnone %buffer, i32 %init) local_unnamed_addr #0 {
entry:
%cmp25 = icmp sgt i32 %nsteps, 0
br i1 %cmp25, label %for.cond1.preheader.lr.ph, label %for.end12
for.cond1.preheader.lr.ph: ; preds = %entry
%cmp223 = icmp sgt i32 %size, 1
%t0 = sext i32 %init to i64
%wide.trip.count = zext i32 %size to i64
%wide.trip.count31 = zext i32 %nsteps to i64
br label %for.cond1.preheader
for.cond1.preheader: ; preds = %for.inc10, %for.cond1.preheader.lr.ph
%indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.inc10 ]
br i1 %cmp223, label %for.body3.lr.ph, label %for.inc10
for.body3.lr.ph: ; preds = %for.cond1.preheader
%t1 = add nsw i64 %indvars.iv28, %t0
%t2 = trunc i64 %indvars.iv28 to i8
br label %for.body3
; Make sure loop-invariant items are grouped together so that the load address
; can be represented in one getelementptr.
; CHECK-LABEL: for.body3:
; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ 1, %for.body3.lr.ph ], [ {{.*}}, %for.body3 ]
; CHECK-NOT: = phi i64
; CHECK-NEXT: [[LOADADDR:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
; CHECK-NEXT: = load i8, i8* [[LOADADDR]], align 1
; CHECK: br i1 %exitcond, label %for.inc10.loopexit, label %for.body3
for.body3: ; preds = %for.body3, %for.body3.lr.ph
%indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
%t5 = trunc i64 %indvars.iv to i8
%t3 = add nsw i64 %t1, %indvars.iv
%arrayidx = getelementptr inbounds i8, i8* %maxarray, i64 %t3
%t4 = load i8, i8* %arrayidx, align 1
%add5 = add i8 %t4, %t5
%add6 = add i8 %add5, %t2
%arrayidx9 = getelementptr inbounds i8, i8* %maxarray, i64 %indvars.iv
store i8 %add6, i8* %arrayidx9, align 1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.inc10.loopexit, label %for.body3
for.inc10.loopexit: ; preds = %for.body3
br label %for.inc10
for.inc10: ; preds = %for.inc10.loopexit, %for.cond1.preheader
%indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1
%exitcond32 = icmp eq i64 %indvars.iv.next29, %wide.trip.count31
br i1 %exitcond32, label %for.end12.loopexit, label %for.cond1.preheader
for.end12.loopexit: ; preds = %for.inc10
br label %for.end12
for.end12: ; preds = %for.end12.loopexit, %entry
ret void
}