[LSR] Generate cross iteration indexes

Modify GenerateConstantOffsetsImpl to create offsets that can be used
by indexed addressing modes. If formulae can be generated in which the
constant offset equals the step of the recurrence, we can generate a
pre-indexed access. This allows the pointer to be updated via the
single pre-indexed access so that (hopefully) no add/subs are required
to update it for the next iteration. For small cores, this can
significantly improve the performance of DSP-like loops.
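
As a rough illustration (not part of the patch), this is the shape of
pointer update the change is after, for a load that strides by 8 bytes
per iteration; the Thumb2 instructions in the comments are assumptions
about what the backend would emit:

  // Illustrative C++ loop: one 4-byte load per iteration, pointer
  // advancing by 8 bytes (two i32 elements).
  void copy_every_other(const int *p, int *out, int n) {
    for (int i = 0; i < n; ++i) {
      // Without backedge indexing, the pointer needs its own update:
      //   ldr  r3, [r0]        @ load
      //   adds r0, r0, #8      @ separate add to advance the pointer
      // When the constant offset equals the step, the access itself can
      // carry the update as a pre-indexed load (base biased by -8 before
      // entering the loop):
      //   ldr  r3, [r0, #8]!   @ load and advance the base register
      out[i] = *p;
      p += 2;
    }
  }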

Differential Revision: https://reviews.llvm.org/D55373

llvm-svn: 353403
Sam Parker 2019-02-07 13:32:54 +00:00
parent bb3b372aa1
commit 67756c09f2
9 changed files with 1598 additions and 40 deletions


@ -486,6 +486,10 @@ public:
/// addressing mode expressions.
bool shouldFavorPostInc() const;
/// Return true if LSR should make efforts to generate indexed addressing
/// modes that operate across loop iterations.
bool shouldFavorBackedgeIndex(const Loop *L) const;
/// Return true if the target supports masked load/store
/// AVX2 and AVX-512 targets allow masks for consecutive load and store
bool isLegalMaskedStore(Type *DataType) const;
@ -1065,6 +1069,7 @@ public:
TargetTransformInfo::LSRCost &C2) = 0;
virtual bool canMacroFuseCmp() = 0;
virtual bool shouldFavorPostInc() const = 0;
virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@ -1301,6 +1306,9 @@ public:
bool shouldFavorPostInc() const override {
return Impl.shouldFavorPostInc();
}
bool shouldFavorBackedgeIndex(const Loop *L) const override {
return Impl.shouldFavorBackedgeIndex(L);
}
bool isLegalMaskedStore(Type *DataType) override {
return Impl.isLegalMaskedStore(DataType);
}


@ -253,6 +253,8 @@ public:
bool shouldFavorPostInc() const { return false; }
bool shouldFavorBackedgeIndex(const Loop *L) const { return false; }
bool isLegalMaskedStore(Type *DataType) { return false; }
bool isLegalMaskedLoad(Type *DataType) { return false; }


@ -162,6 +162,10 @@ bool TargetTransformInfo::shouldFavorPostInc() const {
return TTIImpl->shouldFavorPostInc();
}
bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const {
return TTIImpl->shouldFavorBackedgeIndex(L);
}
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
return TTIImpl->isLegalMaskedStore(DataType);
}


@ -93,6 +93,12 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
bool shouldFavorBackedgeIndex(const Loop *L) const {
if (L->getHeader()->getParent()->optForSize())
return false;
return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
}
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
/// is IEEE-754 compliant, but it's not covered in this target.


@ -154,6 +154,10 @@ static cl::opt<bool> FilterSameScaledReg(
cl::desc("Narrow LSR search space by filtering non-optimal formulae"
" with the same ScaledReg and Scale"));
static cl::opt<bool> EnableBackedgeIndexing(
"lsr-backedge-indexing", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of cross iteration indexed memops"));
static cl::opt<unsigned> ComplexityLimit(
"lsr-complexity-limit", cl::Hidden,
cl::init(std::numeric_limits<uint16_t>::max()),
@ -1052,12 +1056,12 @@ public:
void dump() const;
private:
void RateRegister(const SCEV *Reg,
void RateRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
const TargetTransformInfo &TTI);
void RatePrimaryRegister(const SCEV *Reg,
void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
@ -1208,7 +1212,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
Instruction *Fixup = nullptr);
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
@ -1235,16 +1239,24 @@ void Cost::RateRegister(const SCEV *Reg,
}
unsigned LoopCost = 1;
if (TTI.shouldFavorPostInc()) {
const SCEV *LoopStep = AR->getStepRecurrence(SE);
if (isa<SCEVConstant>(LoopStep)) {
// Check if a post-indexed load/store can be used.
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
if (TTI.shouldFavorBackedgeIndex(L)) {
if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
if (Step->getAPInt() == F.BaseOffset)
LoopCost = 0;
}
if (TTI.shouldFavorPostInc()) {
const SCEV *LoopStep = AR->getStepRecurrence(SE);
if (isa<SCEVConstant>(LoopStep)) {
const SCEV *LoopStart = AR->getStart();
if (!isa<SCEVConstant>(LoopStart) &&
SE.isLoopInvariant(LoopStart, L))
LoopCost = 0;
SE.isLoopInvariant(LoopStart, L))
LoopCost = 0;
}
}
}
@ -1254,7 +1266,7 @@ void Cost::RateRegister(const SCEV *Reg,
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
RateRegister(F, AR->getOperand(1), Regs, L, SE, DT, TTI);
if (isLoser())
return;
}
@ -1278,7 +1290,7 @@ void Cost::RateRegister(const SCEV *Reg,
/// Record this register in the set. If we haven't seen it before, rate
/// it. Optional LoserRegs provides a way to declare any formula that refers to
/// one of those regs an instant loser.
void Cost::RatePrimaryRegister(const SCEV *Reg,
void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
@ -1289,7 +1301,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
return;
}
if (Regs.insert(Reg).second) {
RateRegister(Reg, Regs, L, SE, DT, TTI);
RateRegister(F, Reg, Regs, L, SE, DT, TTI);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
@ -1313,7 +1325,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
RatePrimaryRegister(F, ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@ -1322,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
RatePrimaryRegister(F, BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@ -1889,6 +1901,7 @@ class LSRInstance {
LoopInfo &LI;
const TargetTransformInfo &TTI;
Loop *const L;
bool FavorBackedgeIndex = false;
bool Changed = false;
/// This is the insert position that the current loop's induction variable
@ -2803,7 +2816,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
/// TODO: Consider IVInc free if it's already used in other chains.
static bool
isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
ScalarEvolution &SE, const TargetTransformInfo &TTI) {
ScalarEvolution &SE) {
if (StressIVChain)
return true;
@ -3063,7 +3076,7 @@ void LSRInstance::CollectChains() {
for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
UsersIdx < NChains; ++UsersIdx) {
if (!isProfitableChain(IVChainVec[UsersIdx],
ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
ChainUsersVec[UsersIdx].FarUsers, SE))
continue;
// Preserve the chain at UsesIdx.
if (ChainIdx != UsersIdx)
@ -3077,7 +3090,7 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
for (const IVInc &Inc : Chain) {
LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@ -3737,10 +3750,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
for (int64_t Offset : Worklist) {
auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
Formula F = Base;
F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
LU.AccessTy, F)) {
// Add the offset to the base register.
@ -3760,7 +3774,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(
(void)InsertFormula(LU, LUIdx, F);
}
};
const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
// With constant offsets and constant steps, we can generate pre-inc
// accesses by having the offset equal the step. So, for access #0 with a
// step of 8, we generate a G - 8 base which would require the first access
// to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
// for itself and hopefully becomes the base for other accesses. This
// means that a single pre-indexed access can be generated to become the new
// base pointer for each iteration of the loop, resulting in no extra add/sub
// instructions for pointer updating.
if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
if (auto *StepRec =
dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
const APInt &StepInt = StepRec->getAPInt();
int64_t Step = StepInt.isNegative() ?
StepInt.getSExtValue() : StepInt.getZExtValue();
for (int64_t Offset : Worklist) {
Offset -= Step;
GenerateOffset(G, Offset);
}
}
}
}
for (int64_t Offset : Worklist)
GenerateOffset(G, Offset);
int64_t Imm = ExtractImmediate(G, SE);
if (G->isZero() || Imm == 0)
@ -4417,7 +4459,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
LLVM_DEBUG(
@ -5378,7 +5420,9 @@ void LSRInstance::ImplementSolution(
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
const TargetTransformInfo &TTI)
: IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
: IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L),
FavorBackedgeIndex(EnableBackedgeIndexing &&
TTI.shouldFavorBackedgeIndex(L)) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
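
To make the comment in GenerateConstantOffsetsImpl above concrete: for an
address recurrence with a constant step, each worklist offset is also tried
shifted back by one step, so the first access becomes ((G - Step) + Step) and
can be emitted pre-indexed, updating the base for the following iterations.
A simplified standalone sketch of that candidate generation (the helper and
its names are illustrative, not the LSR data structures):

  #include <cstdint>
  #include <vector>

  // Model of the extra candidates: step-adjusted offsets are tried in
  // addition to the offsets LSR would have generated anyway.
  std::vector<int64_t> candidateOffsets(const std::vector<int64_t> &Worklist,
                                        int64_t Step,
                                        bool FavorBackedgeIndex) {
    std::vector<int64_t> Out;
    if (FavorBackedgeIndex)
      for (int64_t Offset : Worklist)
        Out.push_back(Offset - Step); // bases biased back by one step
    for (int64_t Offset : Worklist)
      Out.push_back(Offset);          // the existing candidates
    return Out;
  }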


@ -0,0 +1,310 @@
; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX
; CHECK-LABEL: test_qadd_2
; CHECK: @ %loop
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: str{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: str{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #4]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_2(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = or i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%i.next = add nsw nuw i32 %i, -2
%idx.next = add nsw nuw i32 %idx.1, 2
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd_2_backwards
; TODO: Indexes should be generated.
; CHECK: @ %loop
; CHECK-DEFAULT: ldr{{.*}},
; CHECK-DEFAULT: ldr{{.*}},
; CHECK-DEFAULT: str{{.*}},
; CHECK-DEFAULT: ldr{{.*}}, #-4]
; CHECK-DEFAULT: ldr{{.*}}, #-4]
; CHECK-DEFAULT: sub{{.*}}, #8
; CHECK-DEFAULT: str{{.*}}, #-4]
; CHECK-DEFAULT: sub{{.*}}, #8
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: str{{.*}} lsl #2]
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: str{{.*}} lsl #2]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_2_backwards(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ %N, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = sub nsw nuw i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%i.next = add nsw nuw i32 %i, -2
%idx.next = sub nsw nuw i32 %idx.1, 2
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd_3
; CHECK: @ %loop
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: str{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #12]!
; CHECK-DEFAULT: ldr{{.*}}, #12]!
; CHECK-DEFAULT: str{{.*}}, #12]!
; CHECK-COMPLEX: ldr{{.*}}, #12]!
; CHECK-COMPLEX: ldr{{.*}}, #12]!
; CHECK-COMPLEX: str{{.*}}, #12]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: str{{.*}}, #8]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_3(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = add nuw nsw i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%idx.3 = add nuw nsw i32 %idx.1, 2
%gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
%a.3 = load i32, i32* %gep.a.3
%gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
%b.3 = load i32, i32* %gep.b.3
%qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
%addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
store i32 %qadd.3, i32* %addr.3
%i.next = add nsw nuw i32 %i, -3
%idx.next = add nsw nuw i32 %idx.1, 3
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd_4
; CHECK: @ %loop
; TODO: pre-inc store
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: str{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: str{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #12]
; CHECK-DEFAULT: ldr{{.*}}, #12]
; CHECK-DEFAULT: str{{.*}}, #12]
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: str{{.*}}, #8]
; CHECK-COMPLEX: ldr{{.*}}, #12]
; CHECK-COMPLEX: ldr{{.*}}, #12]
; CHECK-COMPLEX: str{{.*}}, #12]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_4(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = or i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%idx.3 = or i32 %idx.1, 2
%gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
%a.3 = load i32, i32* %gep.a.3
%gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
%b.3 = load i32, i32* %gep.b.3
%qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
%addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
store i32 %qadd.3, i32* %addr.3
%idx.4 = or i32 %idx.1, 3
%gep.a.4 = getelementptr inbounds i32, i32* %a.array, i32 %idx.4
%a.4 = load i32, i32* %gep.a.4
%gep.b.4 = getelementptr inbounds i32, i32* %b.array, i32 %idx.4
%b.4 = load i32, i32* %gep.b.4
%qadd.4 = call i32 @llvm.arm.qadd(i32 %a.4, i32 %b.4)
%addr.4 = getelementptr inbounds i32, i32* %out.array, i32 %idx.4
store i32 %qadd.4, i32* %addr.4
%i.next = add nsw nuw i32 %i, -4
%idx.next = add nsw nuw i32 %idx.1, 4
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd16_2
; CHECK: @ %loop
; TODO: pre-inc store.
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: str{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #8]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd16_2(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
%cast.a.1 = bitcast i16* %gep.a.1 to i32*
%a.1 = load i32, i32* %cast.a.1
%gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
%cast.b.1 = bitcast i16* %gep.b.1 to i32*
%b.1 = load i32, i32* %cast.b.1
%qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = add nsw nuw i32 %idx.1, 2
%gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
%cast.a.2 = bitcast i16* %gep.a.2 to i32*
%a.2 = load i32, i32* %cast.a.2
%gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
%cast.b.2 = bitcast i16* %gep.b.2 to i32*
%b.2 = load i32, i32* %cast.b.2
%qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%i.next = add nsw nuw i32 %i, -2
%idx.next = add nsw nuw i32 %idx.1, 4
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
declare i32 @llvm.arm.qadd(i32, i32)
declare i32 @llvm.arm.qadd16(i32, i32)


@ -1,10 +1,10 @@
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
define void @test_loop_alignment(i32* %in, i32* %out) optsize {
; CHECK-LABEL: test_loop_alignment:
; CHECK: movs {{r[0-9]+}}, #0
; CHECK: mov{{.*}}, #0
; CHECK: .p2align 2
entry:

File diff suppressed because it is too large.


@ -1,21 +1,15 @@
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s
; CHECK-DEFAULT-LABEL: for.body12.us.us:
; CHECK-DEFAULT: phi i32
; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
; CHECK-DEFAULT: phi i32
; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8
; CHECK-COMPLEX-LABEL: for.body12.us.us:
; CHECK-COMPLEX: phi i32
; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK-COMPLEX: phi i32
; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4
; CHECK-LABEL: for.body12.us.us:
; CHECK: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK: phi i32
; CHECK: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK: phi i32
; CHECK: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
; CHECK: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4
define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) {
entry: