[LV] Try to sink users recursively for first-order recurrences.
Update isFirstOrderRecurrence to explore all uses of a recurrence phi and check if we can sink them. If there are multiple users to sink, they are all sunk after the previous instruction, preserving their original order.

Fixes PR44286 (and another PR or two).

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D84951
commit aa00b1d763 (parent 73b759a33a)
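To illustrate the kind of loop this change targets, here is a small sketch (illustrative only, not part of the patch; the function and value names are invented, though the shape mirrors the @can_sink_with_additional_user test updated below). The recurrence phi %for has a user %add.1, which in turn has a user %add.2; both are defined before the load that produces %for.next, so both must be sunk below that load before the loop can be vectorized, and previously only a single user could be sunk.

define void @recurrence_with_chained_users(i32* %a, i32* %b, i32 %x, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %for = phi i32 [ 0, %entry ], [ %for.next, %loop ]   ; first-order recurrence
  %add.1 = add i32 %for, %x                            ; user of the recurrence phi
  %add.2 = add i32 %add.1, %x                          ; user of a user: needs recursive sinking
  %gep.a = getelementptr inbounds i32, i32* %a, i64 %iv
  %for.next = load i32, i32* %gep.a                    ; "Previous": the next iteration's value
  %gep.b = getelementptr inbounds i32, i32* %b, i64 %iv
  store i32 %add.2, i32* %gep.b
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}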
@@ -14,6 +14,7 @@
 #define LLVM_ANALYSIS_IVDESCRIPTORS_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -174,7 +175,7 @@ public:
   /// to handle Phi as a first-order recurrence.
   static bool
   isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
-                         DenseMap<Instruction *, Instruction *> &SinkAfter,
+                         MapVector<Instruction *, Instruction *> &SinkAfter,
                          DominatorTree *DT);
 
   RecurKind getRecurrenceKind() const { return Kind; }
@@ -295,7 +295,7 @@ public:
   RecurrenceSet &getFirstOrderRecurrences() { return FirstOrderRecurrences; }
 
   /// Return the set of instructions to sink to handle first-order recurrences.
-  DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+  MapVector<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
 
   /// Returns the widest induction type.
   Type *getWidestInductionType() { return WidestIndTy; }
@@ -518,7 +518,7 @@ private:
 
   /// Holds instructions that need to sink past other instructions to handle
   /// first-order recurrences.
-  DenseMap<Instruction *, Instruction *> SinkAfter;
+  MapVector<Instruction *, Instruction *> SinkAfter;
 
   /// Holds the widest induction type encountered.
   Type *WidestIndTy = nullptr;
@@ -34,6 +34,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
 
+#include <set>
+
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -715,7 +717,7 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
 
 bool RecurrenceDescriptor::isFirstOrderRecurrence(
     PHINode *Phi, Loop *TheLoop,
-    DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
+    MapVector<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
 
   // Ensure the phi node is in the loop header and has two incoming values.
   if (Phi->getParent() != TheLoop->getHeader() ||
@@ -741,51 +743,76 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(
       SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
     return false;
 
-  // Ensure every user of the phi node is dominated by the previous value.
-  // The dominance requirement ensures the loop vectorizer will not need to
-  // vectorize the initial value prior to the first iteration of the loop.
-  // TODO: Consider extending this sinking to handle memory instructions and
-  // phis with multiple users.
+  // Ensure every user of the phi node (recursively) is dominated by the
+  // previous value. The dominance requirement ensures the loop vectorizer will
+  // not need to vectorize the initial value prior to the first iteration of the
+  // loop.
+  // TODO: Consider extending this sinking to handle memory instructions.
 
-  // Returns true, if all users of I are dominated by DominatedBy.
-  auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
-    return all_of(I->uses(), [DT, DominatedBy](Use &U) {
-      return DT->dominates(DominatedBy, U);
-    });
+  // We optimistically assume we can sink all users after Previous. Keep a set
+  // of instructions to sink after Previous ordered by dominance in the common
+  // basic block. It will be applied to SinkAfter if all users can be sunk.
+  auto CompareByComesBefore = [](const Instruction *A, const Instruction *B) {
+    return A->comesBefore(B);
   };
+  std::set<Instruction *, decltype(CompareByComesBefore)> InstrsToSink(
+      CompareByComesBefore);
 
-  if (Phi->hasOneUse()) {
-    Instruction *I = Phi->user_back();
+  BasicBlock *PhiBB = Phi->getParent();
+  SmallVector<Instruction *, 8> WorkList;
+  auto TryToPushSinkCandidate = [&](Instruction *SinkCandidate) {
+    // Already sunk SinkCandidate.
+    if (SinkCandidate->getParent() == PhiBB &&
+        InstrsToSink.find(SinkCandidate) != InstrsToSink.end())
+      return true;
 
-    // If the user of the PHI is also the incoming value, we potentially have a
-    // reduction and which cannot be handled by sinking.
-    if (Previous == I)
+    // Cyclic dependence.
+    if (Previous == SinkCandidate)
       return false;
 
-    // We cannot sink terminator instructions.
-    if (I->getParent()->getTerminator() == I)
+    if (DT->dominates(Previous,
+                      SinkCandidate)) // We already are good w/o sinking.
+      return true;
+
+    if (SinkCandidate->getParent() != PhiBB ||
+        SinkCandidate->mayHaveSideEffects() ||
+        SinkCandidate->mayReadFromMemory() || SinkCandidate->isTerminator())
       return false;
 
     // Do not try to sink an instruction multiple times (if multiple operands
     // are first order recurrences).
    // TODO: We can support this case, by sinking the instruction after the
     // 'deepest' previous instruction.
-    if (SinkAfter.find(I) != SinkAfter.end())
+    if (SinkAfter.find(SinkCandidate) != SinkAfter.end())
      return false;
 
-    if (DT->dominates(Previous, I)) // We already are good w/o sinking.
+    // If we reach a PHI node that is not dominated by Previous, we reached a
+    // header PHI. No need for sinking.
+    if (isa<PHINode>(SinkCandidate))
       return true;
 
-    // We can sink any instruction without side effects, as long as all users
-    // are dominated by the instruction we are sinking after.
-    if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
-        allUsesDominatedBy(I, Previous)) {
-      SinkAfter[I] = Previous;
-      return true;
+    // Sink User tentatively and check its users
+    InstrsToSink.insert(SinkCandidate);
+    WorkList.push_back(SinkCandidate);
+    return true;
+  };
+
+  WorkList.push_back(Phi);
+  // Try to recursively sink instructions and their users after Previous.
+  while (!WorkList.empty()) {
+    Instruction *Current = WorkList.pop_back_val();
+    for (User *User : Current->users()) {
+      if (!TryToPushSinkCandidate(cast<Instruction>(User)))
+        return false;
     }
   }
 
-  return allUsesDominatedBy(Phi, Previous);
+  // We can sink all users of Phi. Update the mapping.
+  for (Instruction *I : InstrsToSink) {
+    SinkAfter[I] = Previous;
+    Previous = I;
+  }
+  return true;
 }
 
 /// This function returns the identity element (or neutral element) for
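Reading the new bookkeeping above against a concrete case (illustrative only, with invented names): take a header with a recurrence phi %for fed by a load %for.next, and in-header users %add.1 and %add.2 where %add.2 uses %add.1. The worklist visits %add.1, then %add.2 (and the store that consumes %add.2, which is already dominated by the load and needs no sinking), InstrsToSink keeps them in comesBefore order, and the final loop records SinkAfter[%add.1] = %for.next and SinkAfter[%add.2] = %add.1. The effect during vectorization is as if the scalar body had been written with the chain placed directly after the load:

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %for = phi i32 [ 0, %entry ], [ %for.next, %loop ]
  %gep.a = getelementptr inbounds i32, i32* %a, i64 %iv
  %for.next = load i32, i32* %gep.a
  %add.1 = add i32 %for, %x        ; sunk after %for.next
  %add.2 = add i32 %add.1, %x      ; sunk after %add.1
  ; ... rest of the loop body unchanged ...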
@@ -344,7 +344,7 @@ private:
   /// Legal. This method is only used for the legacy inner loop vectorizer.
   VPlanPtr buildVPlanWithVPRecipes(
       VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
-      const DenseMap<Instruction *, Instruction *> &SinkAfter);
+      const MapVector<Instruction *, Instruction *> &SinkAfter);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
@@ -8994,7 +8994,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   auto &ConditionalAssumes = Legal->getConditionalAssumes();
   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
 
-  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
   // Dead instructions do not need sinking. Remove them from SinkAfter.
   for (Instruction *I : DeadInstructions)
     SinkAfter.erase(I);
@@ -9010,7 +9010,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
 
 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
-    const DenseMap<Instruction *, Instruction *> &SinkAfter) {
+    const MapVector<Instruction *, Instruction *> &SinkAfter) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
@@ -173,20 +173,54 @@ exit:
 ret void
 }
 
-; FIXME: Currently we can only sink a single instruction. For the example below,
-; we also have to sink users.
-define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
-; CHECK-LABEL: @cannot_sink_with_additional_user(
+; Sink users of %pre.phi recursively.
+define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
+; CHECK-LABEL: @can_sink_with_additional_user(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[PREHEADER:%.*]]
 ; CHECK: preheader:
 ; CHECK-NEXT: [[IDX_PHI_TRANS:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
 ; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, i32* [[IDX_PHI_TRANS]], align 4
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[DOTPRE]], i32 3
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1996
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1999, 1996
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR:%.*]]
 ; CHECK: for:
-; CHECK-NEXT: [[PRE_PHI:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[PRE_PHI]], [[X:%.*]]
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[SCALAR_RECUR]], [[X]]
 ; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[X]]
 ; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[IV]]
 ; CHECK-NEXT: [[PRE_NEXT]] = load i32, i32* [[IDX_1]], align 4
@@ -196,14 +230,13 @@ define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
 ; CHECK-NEXT: store i32 [[ADD_4]], i32* [[IDX_2]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
 entry:
 br label %preheader
 
@@ -378,7 +411,9 @@ bb74: ; preds = %bb13
 ret void
 }
 
-; Users that are phi nodes cannot be sunk.
+; The (first) reason `%first_time.1` cannot be sunk is because it appears outside
+; the header and is not dominated by Previous. The fact that it feeds Previous
+; is a second sinking-preventing reason.
 define void @cannot_sink_phi(i32* %ptr) {
 ; CHECK-LABEL: @cannot_sink_phi(
 ; CHECK-NEXT: entry:
@@ -407,19 +442,19 @@ define void @cannot_sink_phi(i32* %ptr) {
 entry:
 br label %loop.header
 
-loop.header: ; preds = %if.end128, %for.cond108.preheader
+loop.header:
 %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
 %for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ]
 %c.1 = icmp ult i64 %iv, 500
 br i1 %c.1, label %if.truebb, label %if.falsebb
 
-if.truebb: ; preds = %for.body114
+if.truebb:
 br label %loop.latch
 
-if.falsebb: ; preds = %for.body114
+if.falsebb:
 br label %loop.latch
 
-loop.latch: ; preds = %if.then122, %for.body114.if.end128_crit_edge
+loop.latch:
 %first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ]
 %c.2 = icmp ult i64 %iv, 800
 %for.next = select i1 %c.2, i32 30, i32 %first_time.1
@@ -606,20 +641,67 @@ if.end:
 define void @sink_dominance(i32* %ptr, i32 %N) {
 ; CHECK-LABEL: @sink_dominance(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[N:%.*]], i32 1)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[UMAX1]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 false, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP12]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[TMP14]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP14]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[FOR]] to i32
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[SCALAR_RECUR]] to i32
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[FOR_TRUNC]], 213
 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[FOR_TRUNC]], i32 22
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[IV]]
 ; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
 ; CHECK-NEXT: [[FOR_NEXT]] = zext i32 [[LV]] to i64
 ; CHECK-NEXT: store i32 [[SELECT]], i32* [[GEP]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N:%.*]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -652,22 +734,71 @@ exit:
 define void @sink_dominance_2(i32* %ptr, i32 %N) {
 ; CHECK-LABEL: @sink_dominance_2(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[N:%.*]], i32 1)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[UMAX1]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 false, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP12]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[TMP15]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[TMP14]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[FOR]] to i32
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[SCALAR_RECUR]] to i32
 ; CHECK-NEXT: [[STEP:%.*]] = add i32 [[FOR_TRUNC]], 2
 ; CHECK-NEXT: [[STEP_2:%.*]] = mul i32 [[STEP]], 99
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[FOR_TRUNC]], 213
 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[FOR_TRUNC]], i32 [[STEP_2]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[IV]]
 ; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
 ; CHECK-NEXT: [[FOR_NEXT]] = zext i32 [[LV]] to i64
 ; CHECK-NEXT: store i32 [[SELECT]], i32* [[GEP]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N:%.*]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -545,11 +545,36 @@ for.end:
 ; b[i] = ((a[i] + 2) * a[i + 1]);
 ; }
 ;
-; NO-SINK-AFTER-LABEL: no_sink_after
-; NO-SINK-AFTER-NOT: vector.ph:
-; NO-SINK-AFTER: }
+; SINK-AFTER-LABEL: @sink_after_with_multiple_users
+; SINK-AFTER-LABEL: vector.ph:
+; SINK-AFTER-NEXT: %n.mod.vf = urem i64 %n, 4
+; SINK-AFTER-NEXT: %n.vec = sub i64 %n, %n.mod.vf
+; SINK-AFTER-NEXT: %vector.recur.init = insertelement <4 x i16> poison, i16 %.pre, i32 3
+; SINK-AFTER-NEXT: br label %vector.body
+
+; SINK-AFTER-LABEL: vector.body:
+; SINK-AFTER-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; SINK-AFTER-NEXT: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER-NEXT: %1 = add i64 %index, 0
+; SINK-AFTER-NEXT: %2 = add nuw nsw i64 %1, 1
+; SINK-AFTER-NEXT: %3 = getelementptr inbounds i16, i16* %a, i64 %2
+; SINK-AFTER-NEXT: %4 = getelementptr inbounds i16, i16* %3, i32 0
+; SINK-AFTER-NEXT: %5 = bitcast i16* %4 to <4 x i16>*
+; SINK-AFTER-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43
+; SINK-AFTER-NEXT: %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT: %7 = sext <4 x i16> %6 to <4 x i32>
+; SINK-AFTER-NEXT: %8 = add nsw <4 x i32> %7, <i32 2, i32 2, i32 2, i32 2>
+; SINK-AFTER-NEXT: %9 = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER-NEXT: %10 = mul nsw <4 x i32> %8, %9
+; SINK-AFTER-NEXT: %11 = getelementptr inbounds i32, i32* %b, i64 %1
+; SINK-AFTER-NEXT: %12 = getelementptr inbounds i32, i32* %11, i32 0
+; SINK-AFTER-NEXT: %13 = bitcast i32* %12 to <4 x i32>*
+; SINK-AFTER-NEXT: store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT: %index.next = add i64 %index, 4
+; SINK-AFTER-NEXT: %14 = icmp eq i64 %index.next, %n.vec
+; SINK-AFTER-NEXT: br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48
 ;
-define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
 entry:
 %.pre = load i16, i16* %a
 br label %for.body
@@ -626,7 +651,7 @@ define void @sink_dead_inst() {
 ; SINK-AFTER-NEXT: %index.next = add i32 %index, 4
 ; SINK-AFTER-NEXT: %vec.ind.next = add <4 x i16> %vec.ind, <i16 4, i16 4, i16 4, i16 4>
 ; SINK-AFTER-NEXT: %6 = icmp eq i32 %index.next, 40
-; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43
+; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50
 ;
 entry:
 br label %for.cond
@@ -708,7 +733,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP46:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
 ; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
@@ -719,7 +744,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[TMP]]
 ; CHECK: bb2:
-; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP48:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP48:!llvm.loop !.*]]
 ;
 bb:
 br label %bb2
@@ -835,7 +860,7 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP49:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
 ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]])
@@ -846,7 +871,7 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
 ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[TMP]]
 ; CHECK: bb2:
-; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP50:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP50:!llvm.loop !.*]]
 ;
 bb:
 br label %bb2