[LV] Never widen an induction variable.

There's no need to widen canonical induction variables. It's just as efficient to create a *new*, wide induction variable. Consider: if we widen an indvar, we'll have to truncate it before its uses anyway (1 trunc). If we create a new indvar instead, we'll have to truncate that one instead (1 trunc) [besides which, IndVars should clean up our mess after us anyway on principle]. This lets us remove a ton of special-casing code. llvm-svn: 246631
2015-09-02 10:15:05 +00:00 · 2015-09-02 10:15:05 +00:00 · a860a2216a
parent c07701b017
commit a860a2216a
2 changed files with 83 additions and 115 deletions
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -468,8 +468,6 @@ protected:
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
-  /// Holds the extended (to the widest induction type) start index.
-  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
  EdgeMaskCache MaskCache;
@ -2605,9 +2603,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
+  //
+  // We try as hard as possible to obtain an induction variable from the
+  // original loop. However, if we don't find one that:
+  //   - is an integer
+  //   - counts from zero, stepping by one
+  //   - is the size of the widest induction variable type
+  // then we create a new one.
  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();
-
+  
  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
@ -2653,11 +2658,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
                      "min.iters.check", VectorPH->getTerminator());

  Builder.SetInsertPoint(VectorPH->getTerminator());
-  Value *StartIdx = ExtendedIdx = ConstantInt::get(IdxTy, 0);
-
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
-                                   VectorPH->getTerminator());
+  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  LoopBypassBlocks.push_back(VectorPH);

@ -2711,24 +2712,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
  setDebugLocFromInst(BypassBuilder,
                      getDebugLocFromInstOrOperands(OldInduction));

-  // We may need to extend the index in case there is a type mismatch.
-  // We know that the count starts at zero and does not overflow.
-  if (Count->getType() != IdxTy) {
-    // The exit count can be of pointer type. Convert it to the correct
-    // integer type.
-    if (ExitCount->getType()->isPointerTy())
-      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
-    else
-      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
-  }
-
  // Add the start index to the loop count to get the new end index.
-  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
+  Value *IdxEnd = BypassBuilder.CreateAdd(ExitCountValue, StartIdx, "end.idx");

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
-  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
-  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
+  Value *R = BypassBuilder.CreateURem(ExitCountValue, Step, "n.mod.vf");
+  Value *CountRoundDown = BypassBuilder.CreateSub(ExitCountValue, R, "n.vec");
  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
                                                     "end.idx.rnd.down");

@ -2804,7 +2794,9 @@ void InnerLoopVectorizer::createEmptyLoop() {
  // If we come from a bypass edge then we need to start from the original
  // start value.

-  // This variable saves the new starting index for the scalar loop.
+  // This variable saves the new starting index for the scalar loop. It is used
+  // to test if there are any tail iterations left once the vector loop has
+  // completed.
  PHINode *ResumeIndex = nullptr;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
@ -2814,84 +2806,32 @@ void InnerLoopVectorizer::createEmptyLoop() {
    PHINode *OrigPhi = I->first;
    InductionDescriptor II = I->second;

-    Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
-    PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                         MiddleBlock->getTerminator());
-    // We might have extended the type of the induction variable but we need a
-    // truncated version for the scalar loop.
-    PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
-      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
-                      MiddleBlock->getTerminator()) : nullptr;
-
    // Create phi nodes to merge from the  backedge-taken check block.
-    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+    PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
+                                           "bc.resume.val",
                                           ScalarPH->getTerminator());
    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);

-    PHINode *BCTruncResumeVal = nullptr;
+    Value *EndValue;
    if (OrigPhi == OldInduction) {
-      BCTruncResumeVal =
-          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
-                          ScalarPH->getTerminator());
-      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
-    }
-
-    Value *EndValue = nullptr;
-    switch (II.getKind()) {
-    case InductionDescriptor::IK_NoInduction:
-      llvm_unreachable("Unknown induction");
-    case InductionDescriptor::IK_IntInduction: {
-      // Handle the integer induction counter.
-      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
-
-      // We have the canonical induction variable.
-      if (OrigPhi == OldInduction) {
-        // Create a truncated version of the resume value for the scalar loop,
-        // we might have promoted the type to a larger width.
-        EndValue =
-          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
-        // The new PHI merges the original incoming value, in case of a bypass,
-        // or the value at the end of the vectorized loop.
-        for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-          TruncResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
-        TruncResumeVal->addIncoming(EndValue, VecBody);
-
-        BCTruncResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[0]);
-
-        // We know what the end value is.
-        EndValue = IdxEndRoundDown;
-        // We also know which PHI node holds it.
-        ResumeIndex = ResumeVal;
-        break;
-      }
-
-      // Not the canonical induction variable - add the vector loop count to the
-      // start value.
-      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
-                                                   II.getStartValue()->getType(),
-                                                   "cast.crd");
-      EndValue = II.transform(BypassBuilder, CRD);
-      EndValue->setName("ind.end");
-      break;
-    }
-    case InductionDescriptor::IK_PtrInduction: {
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+    } else {
      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
                                                   II.getStepValue()->getType(),
                                                   "cast.crd");
      EndValue = II.transform(BypassBuilder, CRD);
-      EndValue->setName("ptr.ind.end");
-      break;
+      EndValue->setName("ind.end");
    }
-    }// end of case

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
-      if (OrigPhi == OldInduction)
-        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
-      else
-        ResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
-    }
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+      ResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
@ -2899,13 +2839,8 @@ void InnerLoopVectorizer::createEmptyLoop() {

    // The old induction's phi node in the scalar body needs the truncated
    // value.
-    if (OrigPhi == OldInduction) {
-      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
-      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
-    } else {
-      BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[0]);
-      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
-    }
+    BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[0]);
+    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
  }

  // If we are generating a new induction variable then we also need to
@ -3526,20 +3461,15 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
      llvm_unreachable("Unknown induction");
    case InductionDescriptor::IK_IntInduction: {
      assert(P->getType() == II.getStartValue()->getType() && "Types must match");
-      Type *PhiTy = P->getType();
-      Value *Broadcasted;
-      if (P == OldInduction) {
-        // Handle the canonical induction variable. We might have had to
-        // extend the type.
-        Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
-      } else {
-        // Handle other induction variables that are now based on the
-        // canonical one.
-        auto *V = Builder.CreateSExtOrTrunc(Induction, PhiTy);
-        Broadcasted = II.transform(Builder, V);
-        Broadcasted->setName("offset.idx");
+      // The canonical induction variable is used directly; every other
+      // induction variable is recomputed from it via II.transform.
+      Value *V = Induction;
+      if (P != OldInduction) {
+        V = Builder.CreateSExtOrTrunc(Induction, P->getType());
+        V = II.transform(Builder, V);
+        V->setName("offset.idx");
      }
-      Broadcasted = getBroadcastInstrs(Broadcasted);
+      Value *Broadcasted = getBroadcastInstrs(V);
      // After broadcasting the induction variable we need to make the vector
      // consecutive by adding 0, 1, 2, etc.
      for (unsigned part = 0; part < UF; ++part)
@ -3550,17 +3480,15 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
      // Handle the pointer induction variable case.
      assert(P->getType()->isPointerTy() && "Unexpected type.");
      // This is the normalized GEP that starts counting at zero.
-      Value *NormalizedIdx =
-          Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
-      NormalizedIdx =
-          Builder.CreateSExtOrTrunc(NormalizedIdx, II.getStepValue()->getType());
+      Value *PtrInd = Induction;
+      PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());
      // This is the vector of results. Notice that we don't generate
      // vector geps because scalar geps result in better code.
      for (unsigned part = 0; part < UF; ++part) {
        if (VF == 1) {
          int EltIndex = part;
-          Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
-          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
+          Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep = II.transform(Builder, GlobalIdx);
          SclrGep->setName("next.gep");
          Entry[part] = SclrGep;
@ -3570,8 +3498,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
        for (unsigned int i = 0; i < VF; ++i) {
          int EltIndex = i + part * VF;
-          Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
-          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
+          Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep = II.transform(Builder, GlobalIdx);
          SclrGep->setName("next.gep");
          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
@ -4239,6 +4167,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
    }
  }

+  // Now that we know the widest induction type, check whether the induction
+  // we found has exactly that type. If it does not, unset it here and
+  // InnerLoopVectorizer will create another.
+  if (Induction && WidestIndTy != Induction->getType())
+    Induction = nullptr;
+
  return true;
 }

--- a/llvm/test/Transforms/LoopVectorize/ptr-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/ptr-induction.ll
@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; This testcase causes SCEV to return a pointer-typed exit value.
+
+; CHECK: @f
+; Expect that the pointer indvar has been converted into an integer indvar.
+; CHECK: %index.next = add i64 %index, 4
+define i32 @f(i32* readonly %a, i32* readnone %b) #0 {
+entry:
+  %cmp.6 = icmp ult i32* %a, %b
+  br i1 %cmp.6, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %a.pn = phi i32* [ %incdec.ptr8, %while.body ], [ %a, %while.body.preheader ]
+  %acc.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %incdec.ptr8 = getelementptr inbounds i32, i32* %a.pn, i64 1
+  %0 = load i32, i32* %incdec.ptr8, align 1
+  %add = add nuw nsw i32 %0, %acc.07
+  %exitcond = icmp eq i32* %incdec.ptr8, %b
+  br i1 %exitcond, label %while.cond.while.end_crit_edge, label %while.body
+
+while.cond.while.end_crit_edge:                   ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
+  %acc.0.lcssa = phi i32 [ %add.lcssa, %while.cond.while.end_crit_edge ], [ 0, %entry ]
+  ret i32 %acc.0.lcssa
+}