diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index f67bfbf5596b..30785340ef12 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -145,7 +145,8 @@ private:
   // Optimise the base and offsets of the given address
   bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
   // Try to fold consecutive geps together into one
-  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
+  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, unsigned &Scale,
+                 IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -1103,8 +1104,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
   return true;
 }
 
-static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
-                                      IRBuilder<> &Builder) {
+static Value *CheckAndCreateOffsetAdd(Value *X, unsigned ScaleX, Value *Y,
+                                      unsigned ScaleY, IRBuilder<> &Builder) {
   // Splat the non-vector value to a vector of the given type - if the value is
   // a constant (and its value isn't too big), we can even use this opportunity
   // to scale it to the size of the vector elements
@@ -1156,40 +1157,49 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
       ConstantInt *ConstYEl =
           dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
       if (!ConstXEl || !ConstYEl ||
-          ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
+          ConstXEl->getZExtValue() * ScaleX +
+                  ConstYEl->getZExtValue() * ScaleY >=
               (unsigned)(1 << (TargetElemSize - 1)))
         return nullptr;
     }
   }
 
-  Value *Add = Builder.CreateAdd(X, Y);
+  Value *XScale = Builder.CreateVectorSplat(
+      XElType->getNumElements(),
+      Builder.getIntN(XElType->getScalarSizeInBits(), ScaleX));
+  Value *YScale = Builder.CreateVectorSplat(
+      YElType->getNumElements(),
+      Builder.getIntN(YElType->getScalarSizeInBits(), ScaleY));
+  Value *Add = Builder.CreateAdd(Builder.CreateMul(X, XScale),
+                                 Builder.CreateMul(Y, YScale));
 
-  FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
-  if (checkOffsetSize(Add, GEPType->getNumElements()))
+  if (checkOffsetSize(Add, XElType->getNumElements()))
     return Add;
   else
     return nullptr;
 }
 
 Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
-                                         Value *&Offsets,
+                                         Value *&Offsets, unsigned &Scale,
                                          IRBuilder<> &Builder) {
   Value *GEPPtr = GEP->getPointerOperand();
   Offsets = GEP->getOperand(1);
+  Scale = DL->getTypeAllocSize(GEP->getSourceElementType());
   // We only merge geps with constant offsets, because only for those
   // we can make sure that we do not cause an overflow
-  if (!isa<Constant>(Offsets))
+  if (GEP->getNumIndices() != 1 || !isa<Constant>(Offsets))
     return nullptr;
-  GetElementPtrInst *BaseGEP;
-  if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
+  if (GetElementPtrInst *BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr)) {
     // Merge the two geps into one
-    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
+    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Scale, Builder);
     if (!BaseBasePtr)
       return nullptr;
-    Offsets =
-        CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
+    Offsets = CheckAndCreateOffsetAdd(
+        Offsets, Scale, GEP->getOperand(1),
+        DL->getTypeAllocSize(GEP->getSourceElementType()), Builder);
     if (Offsets == nullptr)
       return nullptr;
+    Scale = 1; // The merged offsets are already scaled to bytes (i8).
     return BaseBasePtr;
   }
   return GEPPtr;
@@ -1206,15 +1216,24 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
     Builder.SetInsertPoint(GEP);
     Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
     Value *Offsets;
-    Value *Base = foldGEP(GEP, Offsets, Builder);
+    unsigned Scale;
+    Value *Base = foldGEP(GEP, Offsets, Scale, Builder);
     // We only want to merge the geps if there is a real chance that they can be
     // used by an MVE gather; thus the offset has to have the correct size
     // (always i32 if it is not of vector type) and the base has to be a
     // pointer.
     if (Offsets && Base && Base != GEP) {
+      assert(Scale == 1 && "Expected to fold GEP to a scale of 1");
+      Type *BaseTy = Builder.getInt8PtrTy();
+      if (auto *VecTy = dyn_cast<FixedVectorType>(Base->getType()))
+        BaseTy = FixedVectorType::get(BaseTy, VecTy);
       GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
-          GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP);
-      GEP->replaceAllUsesWith(NewAddress);
+          Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets,
+          "gep.merged", GEP);
+      LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP
+                        << "\n      new :  " << *NewAddress << "\n");
+      GEP->replaceAllUsesWith(
+          Builder.CreateBitCast(NewAddress, GEP->getType()));
       GEP = NewAddress;
       Changed = true;
     }
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
index a40e7b8af3e1..d6dddf756895 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -294,19 +294,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI14_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI14_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20
@@ -319,19 +319,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI15_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI15_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
index 9372a98b047a..a7261265552f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
@@ -318,15 +318,15 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI21_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI21_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 14 @ 0xe
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 32 @ 0x20
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 56 @ 0x38
 entry:
   %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
   %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 81119a1a1f19..344cfd415037 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -856,15 +856,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(i8* %base) {
 ; CHECK-OPAQ:       @ %bb.0:
 ; CHECK-OPAQ-NEXT:    adr r1, .LCPI31_0
 ; CHECK-OPAQ-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-OPAQ-NEXT:    bx lr
 ; CHECK-OPAQ-NEXT:    .p2align 4
 ; CHECK-OPAQ-NEXT:  @ %bb.1:
 ; CHECK-OPAQ-NEXT:  .LCPI31_0:
-; CHECK-OPAQ-NEXT:    .long 4294967295 @ 0xffffffff
-; CHECK-OPAQ-NEXT:    .long 15 @ 0xf
-; CHECK-OPAQ-NEXT:    .long 31 @ 0x1f
-; CHECK-OPAQ-NEXT:    .long 47 @ 0x2f
+; CHECK-OPAQ-NEXT:    .long 4294967292 @ 0xfffffffc
+; CHECK-OPAQ-NEXT:    .long 12 @ 0xc
+; CHECK-OPAQ-NEXT:    .long 28 @ 0x1c
+; CHECK-OPAQ-NEXT:    .long 44 @ 0x2c
   %a = getelementptr i8, i8* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
   %b = bitcast <4 x i8*> %a to <4 x i32*>
   %c = getelementptr inbounds i32, <4 x i32*> %b, i32 -1
@@ -892,15 +892,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(i16* %base) {
 ; CHECK-OPAQ:       @ %bb.0:
 ; CHECK-OPAQ-NEXT:    adr r1, .LCPI32_0
 ; CHECK-OPAQ-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-OPAQ-NEXT:    bx lr
 ; CHECK-OPAQ-NEXT:    .p2align 4
 ; CHECK-OPAQ-NEXT:  @ %bb.1:
 ; CHECK-OPAQ-NEXT:  .LCPI32_0:
-; CHECK-OPAQ-NEXT:    .long 15 @ 0xf
-; CHECK-OPAQ-NEXT:    .long 5 @ 0x5
-; CHECK-OPAQ-NEXT:    .long 29 @ 0x1d
-; CHECK-OPAQ-NEXT:    .long 235 @ 0xeb
+; CHECK-OPAQ-NEXT:    .long 12 @ 0xc
+; CHECK-OPAQ-NEXT:    .long 18 @ 0x12
+; CHECK-OPAQ-NEXT:    .long 58 @ 0x3a
+; CHECK-OPAQ-NEXT:    .long 280 @ 0x118
   %a = getelementptr i16, i16* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
   %b = bitcast <4 x i16*> %a to <4 x i8*>
   %c = getelementptr i8, <4 x i8*> %b, <4 x i32> <i32 16, i32 -10, i32 -2, i32 188>
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
index a4007a1077ff..96a0b5332211 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll
@@ -62,9 +62,9 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0]
 ; CHECK-NEXT:    vadd.i32 q2, q2, r2
-; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q2, [r0, q1]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    le lr, .LBB1_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -72,15 +72,15 @@ define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI1_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 9 @ 0x9
-; CHECK-NEXT:    .long 13 @ 0xd
-; CHECK-NEXT:    .long 17 @ 0x11
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 36 @ 0x24
+; CHECK-NEXT:    .long 52 @ 0x34
+; CHECK-NEXT:    .long 68 @ 0x44
 ; CHECK-NEXT:  .LCPI1_1:
-; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 15 @ 0xf
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 60 @ 0x3c
 vector.ph:
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -173,10 +173,10 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias
 ; CHECK-NEXT:    vldrw.u32 q1, [r12]
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, q0, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q2, [r0, q0]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    vadd.i16 q2, q2, r2
-; CHECK-NEXT:    vstrh.16 q2, [r1, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q2, [r1, q1]
 ; CHECK-NEXT:    adds r1, #64
 ; CHECK-NEXT:    le lr, .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -184,23 +184,23 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .short 5 @ 0x5
-; CHECK-NEXT:    .short 9 @ 0x9
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 17 @ 0x11
-; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:    .short 25 @ 0x19
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 33 @ 0x21
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 18 @ 0x12
+; CHECK-NEXT:    .short 26 @ 0x1a
+; CHECK-NEXT:    .short 34 @ 0x22
+; CHECK-NEXT:    .short 42 @ 0x2a
+; CHECK-NEXT:    .short 50 @ 0x32
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 66 @ 0x42
 ; CHECK-NEXT:  .LCPI3_1:
-; CHECK-NEXT:    .short 3 @ 0x3
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 11 @ 0xb
-; CHECK-NEXT:    .short 15 @ 0xf
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 27 @ 0x1b
-; CHECK-NEXT:    .short 31 @ 0x1f
+; CHECK-NEXT:    .short 6 @ 0x6
+; CHECK-NEXT:    .short 14 @ 0xe
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 30 @ 0x1e
+; CHECK-NEXT:    .short 38 @ 0x26
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 54 @ 0x36
+; CHECK-NEXT:    .short 62 @ 0x3e
 vector.ph:
   %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0
   %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -432,9 +432,9 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, q0]
 ; CHECK-NEXT:    vadd.f32 q2, q2, r2
-; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q2, [r0, q1]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    le lr, .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -442,15 +442,15 @@ define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noal
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI7_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 9 @ 0x9
-; CHECK-NEXT:    .long 13 @ 0xd
-; CHECK-NEXT:    .long 17 @ 0x11
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 36 @ 0x24
+; CHECK-NEXT:    .long 52 @ 0x34
+; CHECK-NEXT:    .long 68 @ 0x44
 ; CHECK-NEXT:  .LCPI7_1:
-; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 15 @ 0xf
+; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 60 @ 0x3c
 vector.ph:                                        ; preds = %entry
   %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0
   %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
@@ -549,9 +549,9 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia
 ; CHECK-NEXT:    vldrw.u32 q1, [r2]
 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, q0, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q2, [r0, q0]
 ; CHECK-NEXT:    vadd.f16 q2, q2, r1
-; CHECK-NEXT:    vstrh.16 q2, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q2, [r0, q1]
 ; CHECK-NEXT:    adds r0, #64
 ; CHECK-NEXT:    le lr, .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -559,23 +559,23 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI9_0:
-; CHECK-NEXT:    .short 3 @ 0x3
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 11 @ 0xb
-; CHECK-NEXT:    .short 15 @ 0xf
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 27 @ 0x1b
-; CHECK-NEXT:    .short 31 @ 0x1f
+; CHECK-NEXT:    .short 6 @ 0x6
+; CHECK-NEXT:    .short 14 @ 0xe
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 30 @ 0x1e
+; CHECK-NEXT:    .short 38 @ 0x26
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 54 @ 0x36
+; CHECK-NEXT:    .short 62 @ 0x3e
 ; CHECK-NEXT:  .LCPI9_1:
-; CHECK-NEXT:    .short 5 @ 0x5
-; CHECK-NEXT:    .short 9 @ 0x9
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 17 @ 0x11
-; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:    .short 25 @ 0x19
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 33 @ 0x21
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 18 @ 0x12
+; CHECK-NEXT:    .short 26 @ 0x1a
+; CHECK-NEXT:    .short 34 @ 0x22
+; CHECK-NEXT:    .short 42 @ 0x2a
+; CHECK-NEXT:    .short 50 @ 0x32
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 66 @ 0x42
 vector.ph:
   %y.trunc = fptrunc float %y to half
   %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0
@@ -620,17 +620,17 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x,
 ; CHECK-NEXT:    movs r3, #10
 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q3, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, q0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, q1, uxtw #2]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, q2, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, q2]
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmul.i32 q3, q4, q3
 ; CHECK-NEXT:    add.w r0, r0, #48
 ; CHECK-NEXT:    vmul.i32 q5, q4, q5
 ; CHECK-NEXT:    vmul.i32 q4, q4, r3
 ; CHECK-NEXT:    vstrw.32 q4, [r1, q1, uxtw #2]
-; CHECK-NEXT:    vstrw.32 q5, [r1, q2, uxtw #2]
-; CHECK-NEXT:    vstrw.32 q3, [r1, q0, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q5, [r1, q2]
+; CHECK-NEXT:    vstrw.32 q3, [r1, q0]
 ; CHECK-NEXT:    add.w r1, r1, #48
 ; CHECK-NEXT:    bne .LBB10_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -639,20 +639,20 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x,
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI10_0:
-; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 4 @ 0x4
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 16 @ 0x10
+; CHECK-NEXT:    .long 28 @ 0x1c
+; CHECK-NEXT:    .long 40 @ 0x28
 ; CHECK-NEXT:  .LCPI10_1:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 3 @ 0x3
 ; CHECK-NEXT:    .long 6 @ 0x6
 ; CHECK-NEXT:    .long 9 @ 0x9
 ; CHECK-NEXT:  .LCPI10_2:
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 5 @ 0x5
 ; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 32 @ 0x20
+; CHECK-NEXT:    .long 44 @ 0x2c
 vector.ph:
   br label %vector.body
 
@@ -790,17 +790,17 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x,
 ; CHECK-NEXT:    movs r3, #10
 ; CHECK-NEXT:  .LBB12_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q3, [r0, q0, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q3, [r0, q0]
 ; CHECK-NEXT:    vldrh.u16 q4, [r0, q1, uxtw #1]
-; CHECK-NEXT:    vldrh.u16 q5, [r0, q2, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q5, [r0, q2]
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmul.i16 q3, q4, q3
 ; CHECK-NEXT:    add.w r0, r0, #48
 ; CHECK-NEXT:    vmul.i16 q5, q4, q5
 ; CHECK-NEXT:    vmul.i16 q4, q4, r3
 ; CHECK-NEXT:    vstrh.16 q4, [r1, q1, uxtw #1]
-; CHECK-NEXT:    vstrh.16 q5, [r1, q2, uxtw #1]
-; CHECK-NEXT:    vstrh.16 q3, [r1, q0, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q5, [r1, q2]
+; CHECK-NEXT:    vstrh.16 q3, [r1, q0]
 ; CHECK-NEXT:    add.w r1, r1, #48
 ; CHECK-NEXT:    bne .LBB12_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -809,14 +809,14 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x,
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI12_0:
-; CHECK-NEXT:    .short 1 @ 0x1
-; CHECK-NEXT:    .short 4 @ 0x4
-; CHECK-NEXT:    .short 7 @ 0x7
-; CHECK-NEXT:    .short 10 @ 0xa
-; CHECK-NEXT:    .short 13 @ 0xd
-; CHECK-NEXT:    .short 16 @ 0x10
-; CHECK-NEXT:    .short 19 @ 0x13
-; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 2 @ 0x2
+; CHECK-NEXT:    .short 8 @ 0x8
+; CHECK-NEXT:    .short 14 @ 0xe
+; CHECK-NEXT:    .short 20 @ 0x14
+; CHECK-NEXT:    .short 26 @ 0x1a
+; CHECK-NEXT:    .short 32 @ 0x20
+; CHECK-NEXT:    .short 38 @ 0x26
+; CHECK-NEXT:    .short 44 @ 0x2c
 ; CHECK-NEXT:  .LCPI12_1:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 3 @ 0x3
@@ -827,14 +827,14 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x,
 ; CHECK-NEXT:    .short 18 @ 0x12
 ; CHECK-NEXT:    .short 21 @ 0x15
 ; CHECK-NEXT:  .LCPI12_2:
-; CHECK-NEXT:    .short 2 @ 0x2
-; CHECK-NEXT:    .short 5 @ 0x5
-; CHECK-NEXT:    .short 8 @ 0x8
-; CHECK-NEXT:    .short 11 @ 0xb
-; CHECK-NEXT:    .short 14 @ 0xe
-; CHECK-NEXT:    .short 17 @ 0x11
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
+; CHECK-NEXT:    .short 4 @ 0x4
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 16 @ 0x10
+; CHECK-NEXT:    .short 22 @ 0x16
+; CHECK-NEXT:    .short 28 @ 0x1c
+; CHECK-NEXT:    .short 34 @ 0x22
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
 vector.ph:
   br label %vector.body
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
index c62a9fb64ef1..b2c7b22fb829 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -219,19 +219,19 @@ define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offp
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI9_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI9_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
   %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
index 1fbe8e594626..760dff52bff4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
@@ -258,15 +258,15 @@ define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %of
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI16_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI16_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 14 @ 0xe
+; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 16 @ 0x10
+; CHECK-NEXT:    .long 22 @ 0x16
+; CHECK-NEXT:    .long 28 @ 0x1c
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> <i16 0, i16 3, i16 6, i16 9>
   %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5