[ARM][MVE] Remove old tail predicates

Remove any predicate that we replace with a vctp intrinsic, and try to remove their operands too. Also look into the exit block to see if there are any duplicates of the predicates that we've replaced and clone the vctp to be used there instead. Differential Revision: https://reviews.llvm.org/D67709 llvm-svn: 372567
2019-09-23 09:48:25 +00:00 · 2019-09-23 09:48:25 +00:00 · 9feb429a33
parent 14f6465c15
commit 9feb429a33
6 changed files with 672 additions and 12 deletions
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@ -3986,6 +3986,7 @@ def MVE_VDWDUPu8  : MVE_VxWDUP<"vdwdup", "u8",  0b00, 0b1>;
 def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
 def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;

+let hasSideEffects = 1 in
 class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
  : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
          "$Rn", vpred_n, "", pattern> {
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@ -84,7 +84,7 @@ private:

  /// Is the icmp that generates an i1 vector, based upon a loop counter
  /// and a limit that is defined outside the loop.
-  bool isTailPredicate(Value *Predicate, Value *NumElements);
+  bool isTailPredicate(Instruction *Predicate, Value *NumElements);
 };

 } // end namespace
@ -178,7 +178,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
  return Changed;
 }

-bool MVETailPredication::isTailPredicate(Value *V, Value *NumElements) {
+bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
  // Look for the following:

  // %trip.count.minus.1 = add i32 %N, -1
@ -206,7 +206,7 @@ bool MVETailPredication::isTailPredicate(Value *V, Value *NumElements) {
  Instruction *Induction = nullptr;

  // The vector icmp
-  if (!match(V, m_ICmp(Pred, m_Instruction(Induction),
+  if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
                       m_Instruction(Shuffle))) ||
      Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
    return false;
@ -390,6 +390,55 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
  return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
 }

+// Tidy up after the predicates have been replaced. If the loop has a unique
+// exit block, look in it for a duplicate of each old predicate; this can
+// happen when a select is needed on values from the last and previous
+// iteration. The vctp is cloned in front of the duplicate, rather than being
+// reused, so that the VPR doesn't have to be live into the exit block, which
+// should make it easier to form a proper tail predicated loop. Finally, erase
+// whatever in MaybeDead has no users left, plus operands and PHIs that die.
+static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
+                    SetVector<Instruction*> &MaybeDead, Loop *L) {
+  if (BasicBlock *Exit = L->getUniqueExitBlock()) {
+    for (auto &Pair : NewPredicates) { // maps old predicate -> new predicate
+      Instruction *OldPred = Pair.first;
+      Instruction *NewPred = Pair.second;
+
+      for (auto &I : *Exit) {
+        if (I.isSameOperationAs(OldPred)) { // NOTE(review): presumably does not compare operand values - confirm
+          Instruction *PredClone = NewPred->clone();
+          PredClone->insertBefore(&I);
+          I.replaceAllUsesWith(PredClone);
+          MaybeDead.insert(&I); // the duplicate is now unused; queue it for removal
+          break; // only the first match in the exit block is rewritten
+        }
+      }
+    }
+  }
+
+  // Drain the worklist: unused instructions are collected and their operands queued.
+  SmallPtrSet<Instruction*, 4> Dead;
+  while (!MaybeDead.empty()) {
+    auto *I = MaybeDead.front();
+    MaybeDead.remove(I);
+    if (I->hasNUsesOrMore(1)) // still has a user, so it must stay
+      continue;
+
+    for (auto &U : I->operands()) {
+      if (auto *OpI = dyn_cast<Instruction>(U))
+        MaybeDead.insert(OpI); // may lose its last user once I lets go of it
+    }
+    I->dropAllReferences(); // release operands now; erasure is deferred to the loop below
+    Dead.insert(I);
+  }
+
+  for (auto *I : Dead)
+    I->eraseFromParent();
+
+  for (auto I : L->blocks()) // only the loop's own blocks are scanned for dead PHIs
+    DeleteDeadPHIs(I);
+}
+
 bool MVETailPredication::TryConvert(Value *TripCount) {
  if (!IsPredicatedVectorLoop())
    return false;
@ -400,13 +449,14 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
  // operand is generated from an induction variable.
  Module *M = L->getHeader()->getModule();
  Type *Ty = IntegerType::get(M->getContext(), 32);
-  SmallPtrSet<Value*, 4> Predicates;
+  SetVector<Instruction*> Predicates;
+  DenseMap<Instruction*, Instruction*> NewPredicates;

  for (auto *I : MaskedInsts) {
    Intrinsic::ID ID = I->getIntrinsicID();
    unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
-    Value *Predicate = I->getArgOperand(PredOp);
-    if (Predicates.count(Predicate))
+    auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
+    if (!Predicate || Predicates.count(Predicate))
      continue;

    VectorType *VecTy = getVectorType(I);
@ -445,6 +495,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
    Value *Remaining = Builder.CreateSub(Processed, Factor);
    Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
    Predicate->replaceAllUsesWith(TailPredicate);
+    NewPredicates[Predicate] = cast<Instruction>(TailPredicate);

    // Add the incoming value to the new phi.
    Processed->addIncoming(Remaining, L->getLoopLatch());
@ -453,9 +504,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
               << "TP: Inserted VCTP: " << *TailPredicate << "\n");
  }

-  for (auto I : L->blocks())
-    DeleteDeadPHIs(I);
-
+  // Now clean up.
+  Cleanup(NewPredicates, Predicates, L);
  return true;
 }

--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@ -0,0 +1,292 @@
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK-LABEL: vpsel_mul_reduce_add
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vstr p0, [sp
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vcmp.i32
+; CHECK:      vpsel
+; CHECK:      vldr p0, [sp
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK-NEXT: vpsel
+; CHECK-NEXT: vaddv.u32
+define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
+entry: ; add-reduction of select(%cmp, b, c) * a, with every load masked by %tmp1
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; lane mask: element index <= N - 1
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %rem = urem i32 %index, 16
+  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
+  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0> ; same in every lane: (index urem 16) == 0
+  %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
+  %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
+  %add = add nsw <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %tmp7 = icmp eq i32 %index.next, %n.vec
+  br i1 %tmp7, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; loop mask used outside the loop: %add where set, else %vec.phi
+  %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: vpsel_mul_reduce_add_2
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vstr p0, [sp
+; CHECK:      vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vsub
+; CHECK:      vpst
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vcmp.i32
+; CHECK:      vpsel
+; CHECK:      vldr p0, [sp
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK-NEXT: vpsel
+; CHECK-NEXT: vaddv.u32
+define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry: ; add-reduction of select(%cmp, c - d, b) * a, with every load masked by %tmp1
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; lane mask: element index <= N - 1
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+  %rem = urem i32 %index, 16
+  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
+  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0> ; same in every lane: (index urem 16) == 0
+  %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
+  %mul = mul  <4 x i32> %sel, %wide.masked.load.a
+  %add = add  <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %cmp.exit = icmp eq i32 %index.next, %n.vec
+  br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; loop mask used outside the loop: %add where set, else %vec.phi
+  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: and_mul_reduce_add
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vpsttt
+; CHECK-NEXT: vcmpt.i32	eq, {{.*}}, zr
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpsel
+define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry: ; add-reduction of c * d, where the c/d loads are masked by ((a - b) == 0) AND %tmp1
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; lane mask: element index <= N - 1
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
+  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
+  %mask = and <4 x i1> %cmp, %tmp1 ; data-dependent condition combined with the tail mask
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %mul = mul  <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+  %add = add  <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %cmp.exit = icmp eq i32 %index.next, %n.vec
+  br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; loop mask used outside the loop: %add where set, else %vec.phi
+  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
+; CHECK-LABEL: or_mul_reduce_add
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vstr p0, [sp
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vcmp.i32	eq, {{.*}}, zr
+; CHECK:      vmrs [[VCMP:r[0-9]+]], p0
+; CHECK:      vldr p0, [sp
+; CHECK:      vmrs [[VCTP:r[0-9]+]], p0
+; CHECK:      orr{{.*}} [[VCMP]], [[VCTP]]
+; CHECK-NEXT: vmsr p0
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpsel
+define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+                                        i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry: ; add-reduction of c * d, where the c/d loads are masked by ((a - b) == 0) OR %tmp1
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; lane mask: element index <= N - 1
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
+  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
+  %mask = or <4 x i1> %cmp, %tmp1 ; OR variant: lanes outside the tail mask stay enabled when %cmp is set
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %mul = mul  <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+  %add = add  <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %cmp.exit = icmp eq i32 %index.next, %n.vec
+  br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; loop mask used outside the loop: %add where set, else %vec.phi
+  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; Function Attrs: argmemonly nounwind readonly willreturn
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@ -0,0 +1,242 @@
+; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK-LABEL: mul_reduce_add
+; CHECK:      dls lr,
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      le	lr, [[LOOP]]
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK:      vpsel
+; CHECK:      vaddv.u32	r0
+define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry: ; add-reduction of b * a, with both loads masked by %1
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; lane mask: element index <= N - 1
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
+  %6 = add nsw <4 x i32> %5, %vec.phi
+  %index.next = add i32 %index, 4
+  %7 = icmp eq i32 %index.next, %n.vec
+  br i1 %7, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi ; loop mask used outside the loop: %6 where set, else %vec.phi
+  %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
+entry: ; NOTE(review): no CHECK lines precede this function, %b is unused and the body adds rather than multiplies - confirm intent
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 ; lane mask: element index <= N - 1
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
+  %index.next = add i32 %index, 4
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi ; loop mask used outside the loop: %3 where set, else %vec.phi
+  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: add_reduce_add_const
+; CHECK:      dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      subs [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      vadd.i32
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpsel
+define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
+entry: ; add-reduction of the masked loads from %a; %b is not referenced in the body
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop; the result phi then yields 0
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 ; lane mask: element index <= N - 1
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
+  %index.next = add i32 %index, 4
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi ; loop mask used outside the loop: %3 where set, else %vec.phi
+  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: vector_mul_const
+; CHECK:      dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      subs [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
+; CHECK:      vmul.i32
+; CHECK:      vpst	
+; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
+entry: ; masked a[i] = b[i] * c; no reduction, so the loop mask has no user after the loop
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
+  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the scalar %c
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %0 = getelementptr inbounds i32, i32* %b, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 ; lane mask: element index <= N - 1
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
+  %4 = getelementptr inbounds i32, i32* %a, i32 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: vector_add_const
+; CHECK:      dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      subs [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
+; CHECK:      vadd.i32
+; CHECK:      vpst	
+; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
+entry: ; masked a[i] = b[i] + c; no reduction, so the loop mask has no user after the loop
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph ; N == 0 bypasses the loop
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4 ; (N + 3) & -4, the value %index.next is compared against
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
+  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
+  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the scalar %c
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices index+0..index+3
+  %0 = getelementptr inbounds i32, i32* %b, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 ; lane mask: element index <= N - 1
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
+  %4 = getelementptr inbounds i32, i32* %a, i32 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@ -0,0 +1,75 @@
+
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: vec_mul_reduce_add
+; The loop's vctp must read the ELTS phi (elements remaining on entry to the iteration) and is emitted before that count is decremented.
+; CHECK: vector.body:
+; CHECK-NOT: phi i32 [ 0, %vector.ph ]
+; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
+; The duplicate predicate in the exit block is replaced by a clone of the loop's vctp, so it takes the same ELTS operand.
+; CHECK: middle.block:
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
+; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
+
+define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; returns the sum of a[i] * b[i], computed with a lane-masked <4 x i32> loop
+entry:
+  %cmp8 = icmp eq i32 %N, 0 ; N == 0 bypasses the loop and yields 0 via the phi in for.cond.cleanup
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1 ; iteration count handed to set.loop.iterations: ((((N + 3) >> 2) << 2) - 4 >> 2) + 1
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+  
+vector.ph:                                        ; preds = %entry
+  %trip.count.minus.1 = add i32 %N, -1 ; index of the last valid element
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N - 1 splatted across all four lanes
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+  
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, advanced by 4 each iteration
+  %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ] ; per-lane running sums
+  %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ] ; remaining iteration counter
+  %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
+  %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane element indices: index + {0, 1, 2, 3}
+  %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; lane mask: element index <= N - 1 (the predicate the pass is expected to replace with a vctp)
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
+  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
+  %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
+  %9 = add nsw <4 x i32> %8, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
+  %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+  %11 = icmp ne i32 %10, 0
+  br i1 %11, label %vector.body, label %middle.block
+  
+middle.block:                                     ; preds = %vector.body
+  %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; same comparison as %7, repeated outside the loop
+  %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi ; masked-off lanes keep the sum from before the final iteration
+  %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
+  br label %for.cond.cleanup
+  
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+  
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+  
--- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
@ -4,8 +4,8 @@
 define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
 ; CHECK-LABEL: vctp8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vctp.8 r0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r2]
@ -20,8 +20,8 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
 define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
 ; CHECK-LABEL: vctp16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vctp.16 r0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r2]
@ -36,8 +36,8 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
 define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
 ; CHECK-LABEL: vctp32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vctp.32 r0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r2]