[ARM] Fix MVE gather/scatter merged gep offsets

This fixes the combining of constant vector GEP operands in the optimization of MVE gather/scatter addresses when opaque pointers are enabled. Because opaque pointers reduce the number of bitcasts between geps, more geps can be folded than before. This can cause problems if the two geps now index different element types, since their offsets are then scaled by different amounts. The fix makes sure each constant is scaled appropriately before the offsets are added, which has the effect of transforming the merged geps to have a scale of 1, changing [r0, q0, uxtw #1] gathers to [r0, q0] with a larger q0. This allows the use of a simpler instruction that doesn't need the extra uxtw. Differential Revision: https://reviews.llvm.org/D127733
2022-06-22 11:04:22 +01:00 · 2022-06-22 11:04:22 +01:00 · 979400be78
parent f98697642c
commit 979400be78
7 changed files with 169 additions and 150 deletions
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@ -145,7 +145,8 @@ private:
  // Optimise the base and offsets of the given address
  bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
  // Try to fold consecutive geps together into one
-  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
+  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, unsigned &Scale,
+                 IRBuilder<> &Builder);
  // Check whether these offsets could be moved out of the loop they're in
  bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
  // Pushes the given add out of the loop
@ -1103,8 +1104,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
  return true;
 }

-static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
-                                      IRBuilder<> &Builder) {
+static Value *CheckAndCreateOffsetAdd(Value *X, unsigned ScaleX, Value *Y,
+                                      unsigned ScaleY, IRBuilder<> &Builder) {
  // Splat the non-vector value to a vector of the given type - if the value is
  // a constant (and its value isn't too big), we can even use this opportunity
  // to scale it to the size of the vector elements
@ -1156,40 +1157,49 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
      ConstantInt *ConstYEl =
          dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
      if (!ConstXEl || !ConstYEl ||
-          ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
+          ConstXEl->getZExtValue() * ScaleX +
+                  ConstYEl->getZExtValue() * ScaleY >=
              (unsigned)(1 << (TargetElemSize - 1)))
        return nullptr;
    }
  }

-  Value *Add = Builder.CreateAdd(X, Y);
+  Value *XScale = Builder.CreateVectorSplat(
+      XElType->getNumElements(),
+      Builder.getIntN(XElType->getScalarSizeInBits(), ScaleX));
+  Value *YScale = Builder.CreateVectorSplat(
+      YElType->getNumElements(),
+      Builder.getIntN(YElType->getScalarSizeInBits(), ScaleY));
+  Value *Add = Builder.CreateAdd(Builder.CreateMul(X, XScale),
+                                 Builder.CreateMul(Y, YScale));

-  FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
-  if (checkOffsetSize(Add, GEPType->getNumElements()))
+  if (checkOffsetSize(Add, XElType->getNumElements()))
    return Add;
  else
    return nullptr;
 }

 Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
-                                         Value *&Offsets,
+                                         Value *&Offsets, unsigned &Scale,
                                         IRBuilder<> &Builder) {
  Value *GEPPtr = GEP->getPointerOperand();
  Offsets = GEP->getOperand(1);
+  Scale = DL->getTypeAllocSize(GEP->getSourceElementType());
  // We only merge geps with constant offsets, because only for those
  // we can make sure that we do not cause an overflow
-  if (!isa<Constant>(Offsets))
+  if (GEP->getNumIndices() != 1 || !isa<Constant>(Offsets))
    return nullptr;
-  GetElementPtrInst *BaseGEP;
-  if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
+  if (GetElementPtrInst *BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr)) {
    // Merge the two geps into one
-    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
+    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Scale, Builder);
    if (!BaseBasePtr)
      return nullptr;
-    Offsets =
-        CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
+    Offsets = CheckAndCreateOffsetAdd(
+        Offsets, Scale, GEP->getOperand(1),
+        DL->getTypeAllocSize(GEP->getSourceElementType()), Builder);
    if (Offsets == nullptr)
      return nullptr;
+    Scale = 1; // Scale is always an i8 at this point.
    return BaseBasePtr;
  }
  return GEPPtr;
@ -1206,15 +1216,24 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
    Builder.SetInsertPoint(GEP);
    Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
    Value *Offsets;
-    Value *Base = foldGEP(GEP, Offsets, Builder);
+    unsigned Scale;
+    Value *Base = foldGEP(GEP, Offsets, Scale, Builder);
    // We only want to merge the geps if there is a real chance that they can be
    // used by an MVE gather; thus the offset has to have the correct size
    // (always i32 if it is not of vector type) and the base has to be a
    // pointer.
    if (Offsets && Base && Base != GEP) {
+      assert(Scale == 1 && "Expected to fold GEP to a scale of 1");
+      Type *BaseTy = Builder.getInt8PtrTy();
+      if (auto *VecTy = dyn_cast<FixedVectorType>(Base->getType()))
+        BaseTy = FixedVectorType::get(BaseTy, VecTy);
      GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
-          GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP);
-      GEP->replaceAllUsesWith(NewAddress);
+          Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets,
+          "gep.merged", GEP);
+      LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP
+                        << "\n      new :  " << *NewAddress << "\n");
+      GEP->replaceAllUsesWith(
+          Builder.CreateBitCast(NewAddress, GEP->getType()));
      GEP = NewAddress;
      Changed = true;
    }
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@ -294,19 +294,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI14_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI14_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20
@ -319,19 +319,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI15_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI15_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
@ -318,15 +318,15 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI21_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI21_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 14 @ 0xe
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 32 @ 0x20
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 56 @ 0x38
 entry:
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@ -856,15 +856,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(i8* %base) {
 ; CHECK-OPAQ:       @ %bb.0:
 ; CHECK-OPAQ-NEXT:    adr r1, .LCPI31_0
 ; CHECK-OPAQ-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-OPAQ-NEXT:    bx lr
 ; CHECK-OPAQ-NEXT:    .p2align 4
 ; CHECK-OPAQ-NEXT:  @ %bb.1:
 ; CHECK-OPAQ-NEXT:  .LCPI31_0:
-; CHECK-OPAQ-NEXT:    .long 4294967295 @ 0xffffffff
-; CHECK-OPAQ-NEXT:    .long 15 @ 0xf
-; CHECK-OPAQ-NEXT:    .long 31 @ 0x1f
-; CHECK-OPAQ-NEXT:    .long 47 @ 0x2f
+; CHECK-OPAQ-NEXT:    .long 4294967292 @ 0xfffffffc
+; CHECK-OPAQ-NEXT:    .long 12 @ 0xc
+; CHECK-OPAQ-NEXT:    .long 28 @ 0x1c
+; CHECK-OPAQ-NEXT:    .long 44 @ 0x2c
  %a = getelementptr i8, i8* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
  %b = bitcast <4 x i8*> %a to <4 x i32*>
  %c = getelementptr inbounds i32, <4 x i32*> %b, i32 -1
@ -892,15 +892,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(i16* %base) {
 ; CHECK-OPAQ:       @ %bb.0:
 ; CHECK-OPAQ-NEXT:    adr r1, .LCPI32_0
 ; CHECK-OPAQ-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-OPAQ-NEXT:    bx lr
 ; CHECK-OPAQ-NEXT:    .p2align 4
 ; CHECK-OPAQ-NEXT:  @ %bb.1:
 ; CHECK-OPAQ-NEXT:  .LCPI32_0:
-; CHECK-OPAQ-NEXT:    .long 15 @ 0xf
-; CHECK-OPAQ-NEXT:    .long 5 @ 0x5
-; CHECK-OPAQ-NEXT:    .long 29 @ 0x1d
-; CHECK-OPAQ-NEXT:    .long 235 @ 0xeb
+; CHECK-OPAQ-NEXT:    .long 12 @ 0xc
+; CHECK-OPAQ-NEXT:    .long 18 @ 0x12
+; CHECK-OPAQ-NEXT:    .long 58 @ 0x3a
+; CHECK-OPAQ-NEXT:    .long 280 @ 0x118
  %a = getelementptr i16, i16* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
  %b = bitcast <4 x i16*> %a to <4 x i8*>
  %c = getelementptr i8, <4 x i8*> %b, <4 x i32> <i32 16, i32 -10, i32 -2, i32 188>
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
@ -62,9 +62,9 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r2
-; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q2, [r0, q1]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    le lr, .LBB1_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@ -72,15 +72,15 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI1_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 9 @ 0x9
-; CHECK-NEXT:    .long 13 @ 0xd
-; CHECK-NEXT:    .long 17 @ 0x11
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 36 @ 0x24
+; CHECK-NEXT:    .long 52 @ 0x34
+; CHECK-NEXT:    .long 68 @ 0x44
 ; CHECK-NEXT:  .LCPI1_1:
-; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 15 @ 0xf
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 60 @ 0x3c
 vector.ph:
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@ -173,10 +173,10 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias
 ; CHECK-NEXT:    vldrw.u32 q1, [r12]
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, q0, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q2, [r0, q0]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    vadd.i16 q2, q2, r2
-; CHECK-NEXT:    vstrh.16 q2, [r1, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q2, [r1, q1]
 ; CHECK-NEXT:    adds r1, #64
 ; CHECK-NEXT:    le lr, .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@ -184,23 +184,23 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .short 5 @ 0x5
-; CHECK-NEXT:    .short 9 @ 0x9
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 17 @ 0x11
-; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:    .short 25 @ 0x19
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 33 @ 0x21
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 18 @ 0x12
+; CHECK-NEXT:    .short 26 @ 0x1a
+; CHECK-NEXT:    .short 34 @ 0x22
+; CHECK-NEXT:    .short 42 @ 0x2a
+; CHECK-NEXT:    .short 50 @ 0x32
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 66 @ 0x42
 ; CHECK-NEXT:  .LCPI3_1:
-; CHECK-NEXT:    .short 3 @ 0x3
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 11 @ 0xb
-; CHECK-NEXT:    .short 15 @ 0xf
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 27 @ 0x1b
-; CHECK-NEXT:    .short 31 @ 0x1f
+; CHECK-NEXT:    .short 6 @ 0x6
+; CHECK-NEXT:    .short 14 @ 0xe
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 30 @ 0x1e
+; CHECK-NEXT:    .short 38 @ 0x26
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 54 @ 0x36
+; CHECK-NEXT:    .short 62 @ 0x3e
 vector.ph:
  %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0
  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
@ -432,9 +432,9 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0]
 ; CHECK-NEXT:    vadd.f32 q2, q2, r2
-; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q2, [r0, q1]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    le lr, .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@ -442,15 +442,15 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI7_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 9 @ 0x9
-; CHECK-NEXT:    .long 13 @ 0xd
-; CHECK-NEXT:    .long 17 @ 0x11
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 36 @ 0x24
+; CHECK-NEXT:    .long 52 @ 0x34
+; CHECK-NEXT:    .long 68 @ 0x44
 ; CHECK-NEXT:  .LCPI7_1:
-; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 15 @ 0xf
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 60 @ 0x3c
 vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
@ -549,9 +549,9 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia
 ; CHECK-NEXT:    vldrw.u32 q1, [r2]
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, q0, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q2, [r0, q0]
 ; CHECK-NEXT:    vadd.f16 q2, q2, r1
-; CHECK-NEXT:    vstrh.16 q2, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q2, [r0, q1]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    le lr, .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@ -559,23 +559,23 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI9_0:
-; CHECK-NEXT:    .short 3 @ 0x3
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 11 @ 0xb
-; CHECK-NEXT:    .short 15 @ 0xf
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 27 @ 0x1b
-; CHECK-NEXT:    .short 31 @ 0x1f
+; CHECK-NEXT:    .short 6 @ 0x6
+; CHECK-NEXT:    .short 14 @ 0xe
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 30 @ 0x1e
+; CHECK-NEXT:    .short 38 @ 0x26
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 54 @ 0x36
+; CHECK-NEXT:    .short 62 @ 0x3e
 ; CHECK-NEXT:  .LCPI9_1:
-; CHECK-NEXT:    .short 5 @ 0x5
-; CHECK-NEXT:    .short 9 @ 0x9
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 17 @ 0x11
-; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:    .short 25 @ 0x19
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 33 @ 0x21
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 18 @ 0x12
+; CHECK-NEXT:    .short 26 @ 0x1a
+; CHECK-NEXT:    .short 34 @ 0x22
+; CHECK-NEXT:    .short 42 @ 0x2a
+; CHECK-NEXT:    .short 50 @ 0x32
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 66 @ 0x42
 vector.ph:
  %y.trunc = fptrunc float %y to half
  %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0
@ -620,17 +620,17 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x,
 ; CHECK-NEXT:    movs r3, #10
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q3, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, q0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, q1, uxtw #2]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, q2, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, q2]
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmul.i32 q3, q4, q3
 ; CHECK-NEXT:    add.w r0, r0, #48
 ; CHECK-NEXT:    vmul.i32 q5, q4, q5
 ; CHECK-NEXT:    vmul.i32 q4, q4, r3
 ; CHECK-NEXT:    vstrw.32 q4, [r1, q1, uxtw #2]
-; CHECK-NEXT:    vstrw.32 q5, [r1, q2, uxtw #2]
-; CHECK-NEXT:    vstrw.32 q3, [r1, q0, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q5, [r1, q2]
+; CHECK-NEXT:    vstrw.32 q3, [r1, q0]
 ; CHECK-NEXT:    add.w r1, r1, #48
 ; CHECK-NEXT:    bne .LBB10_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@ -639,20 +639,20 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x,
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI10_0:
-; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 4 @ 0x4
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 16 @ 0x10
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 40 @ 0x28
 ; CHECK-NEXT:  .LCPI10_1:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 3 @ 0x3
 ; CHECK-NEXT:    .long 6 @ 0x6
 ; CHECK-NEXT:    .long 9 @ 0x9
 ; CHECK-NEXT:  .LCPI10_2:
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 5 @ 0x5
 ; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 32 @ 0x20
+; CHECK-NEXT:    .long 44 @ 0x2c
 vector.ph:
  br label %vector.body

@ -790,17 +790,17 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x,
 ; CHECK-NEXT:    movs r3, #10
 ; CHECK-NEXT:  .LBB12_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q3, [r0, q0, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q3, [r0, q0]
 ; CHECK-NEXT:    vldrh.u16 q4, [r0, q1, uxtw #1]
-; CHECK-NEXT:    vldrh.u16 q5, [r0, q2, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q5, [r0, q2]
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmul.i16 q3, q4, q3
 ; CHECK-NEXT:    add.w r0, r0, #48
 ; CHECK-NEXT:    vmul.i16 q5, q4, q5
 ; CHECK-NEXT:    vmul.i16 q4, q4, r3
 ; CHECK-NEXT:    vstrh.16 q4, [r1, q1, uxtw #1]
-; CHECK-NEXT:    vstrh.16 q5, [r1, q2, uxtw #1]
-; CHECK-NEXT:    vstrh.16 q3, [r1, q0, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q5, [r1, q2]
+; CHECK-NEXT:    vstrh.16 q3, [r1, q0]
 ; CHECK-NEXT:    add.w r1, r1, #48
 ; CHECK-NEXT:    bne .LBB12_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@ -809,14 +809,14 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x,
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI12_0:
-; CHECK-NEXT:    .short 1 @ 0x1
-; CHECK-NEXT:    .short 4 @ 0x4
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 10 @ 0xa
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 16 @ 0x10
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 2 @ 0x2
+; CHECK-NEXT:    .short 8 @ 0x8
+; CHECK-NEXT:    .short 14 @ 0xe
+; CHECK-NEXT:    .short 20 @ 0x14
+; CHECK-NEXT:    .short 26 @ 0x1a
+; CHECK-NEXT:    .short 32 @ 0x20
+; CHECK-NEXT:    .short 38 @ 0x26
+; CHECK-NEXT:    .short 44 @ 0x2c
 ; CHECK-NEXT:  .LCPI12_1:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 3 @ 0x3
@ -827,14 +827,14 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x,
 ; CHECK-NEXT:    .short 18 @ 0x12
 ; CHECK-NEXT:    .short 21 @ 0x15
 ; CHECK-NEXT:  .LCPI12_2:
-; CHECK-NEXT:    .short 2 @ 0x2
-; CHECK-NEXT:    .short 5 @ 0x5
-; CHECK-NEXT:    .short 8 @ 0x8
-; CHECK-NEXT:    .short 11 @ 0xb
-; CHECK-NEXT:    .short 14 @ 0xe
-; CHECK-NEXT:    .short 17 @ 0x11
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
+; CHECK-NEXT:    .short 4 @ 0x4
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 16 @ 0x10
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 28 @ 0x1c
+; CHECK-NEXT:    .short 34 @ 0x22
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
 vector.ph:
  br label %vector.body

--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@ -219,19 +219,19 @@ define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offp
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI9_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI9_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
  %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
@ -258,15 +258,15 @@ define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %of
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI16_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI16_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 14 @ 0xe
+; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 16 @ 0x10
+; CHECK-NEXT:    .long 22 @ 0x16
+; CHECK-NEXT:    .long 28 @ 0x1c
 entry:
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> <i16 0, i16 3, i16 6, i16 9>
  %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5